In [None]:
import os
import requests
import time
import whisper
from pydub import AudioSegment
from pyannote.audio import Pipeline

# Function to download the audio file
def download_audio_file(link, file_name, timeout=30, chunk_size=65536):
    """Download the audio at ``link`` into ``file_name``.

    The response body is streamed to disk in chunks so large recordings are
    never held fully in memory, and the HTTP connection is always released.

    Args:
        link: URL of the audio file to fetch.
        file_name: Local path the audio is written to.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.
        chunk_size: Number of bytes read from the response per write.

    Returns:
        ``file_name`` on success, or ``None`` when the server answers with a
        non-200 status or the download/write fails.
    """
    try:
        # The context manager closes the connection even on early return.
        with requests.get(link, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download file: {link}")
                return None
            with open(file_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading file: {str(e)}")
        # A failure mid-stream can leave a truncated file behind; remove it
        # so callers never pick up a partial recording.
        try:
            if os.path.exists(file_name):
                os.remove(file_name)
        except OSError:
            pass  # best-effort cleanup only
        return None

    print(f"Audio file saved: {file_name}")
    return file_name

# Function to delete temporary audio files
def delete_audio_file(file_name):
    """Remove ``file_name`` from disk when it exists and report the deletion.

    Nothing happens (and nothing is printed) when the path is absent.
    """
    if not os.path.exists(file_name):
        return
    os.remove(file_name)
    print(f"Audio file deleted: {file_name}")


# Function to save the final transcription to an Excel file
def save_transcriptions_to_excel(transcriptions, output_file):
    """Write ``transcriptions`` to ``output_file`` as an Excel sheet.

    ``transcriptions`` is handed to ``pandas.DataFrame`` as-is and written
    without the index column.
    """
    # Imported lazily so pandas is only needed when results are saved.
    import pandas as pd

    pd.DataFrame(transcriptions).to_excel(output_file, index=False)
    print(f"Transcriptions saved to {output_file}")


# Function to split audio by diarization intervals
def split_audio_by_diarization(audio_path, diarization, buffer=0.1):
    """Cut ``audio_path`` into one WAV clip per diarization turn.

    Each clip is padded by ``buffer`` on both sides (the start is clamped at
    zero) and exported to ``segment_<index>_speaker_<label>.wav`` in the
    current working directory.

    Returns:
        A list of dicts, one per turn in iteration order, with the clip
        ``path``, the turn's unpadded ``start`` and ``end``, and the
        ``speaker`` label.
    """
    full_audio = AudioSegment.from_file(audio_path)

    def export_turn(index, turn, speaker):
        # Slice bounds are in milliseconds; turn bounds are scaled by 1000.
        clip_start = max(0, int((turn.start - buffer) * 1000))
        clip_end = int((turn.end + buffer) * 1000)
        clip_path = f"segment_{index}_speaker_{speaker}.wav"
        full_audio[clip_start:clip_end].export(clip_path, format="wav")
        return {
            "path": clip_path,
            "start": turn.start,
            "end": turn.end,
            "speaker": speaker,
        }

    tracks = diarization.itertracks(yield_label=True)
    return [
        export_turn(index, turn, speaker)
        for index, (turn, _, speaker) in enumerate(tracks)
    ]

# Function to transcribe each audio segment
def transcribe_segments(segments, whisper_model):
    """Transcribe every diarized clip and attach the text to its speaker turn.

    Whisper reports word-level timestamps nested under
    ``result["segments"][i]["words"]``, each entry keyed by ``"word"``, and
    the timestamps are relative to the clip that was transcribed. Because each
    clip already covers a single diarization turn, every recognized word
    belongs to that turn and no time filtering against the (absolute)
    diarization bounds is applied.

    Args:
        segments: Iterable of dicts with ``path``, ``start``, ``end`` and
            ``speaker`` keys, as produced by ``split_audio_by_diarization``.
        whisper_model: Object exposing
            ``transcribe(path, word_timestamps=True)`` that returns a
            Whisper-style result dict.

    Returns:
        A list of dicts with ``speaker``, ``start``, ``end`` and ``text``
        keys, one per successfully transcribed clip. Clips whose
        transcription raises are reported and skipped.
    """
    transcriptions = []
    last_result = None  # kept only for the trailing debug print

    for segment in segments:
        try:
            # Perform transcription
            print(f"Transcribing segment: {segment['path']} (Speaker: {segment['speaker']})")
            result = whisper_model.transcribe(segment["path"], word_timestamps=True)
            last_result = result

            # Debug: Print the transcription result structure
            print(f"Transcription result for {segment['path']}: {result}")

            # Flatten the per-segment word lists into one sequence.
            words = [
                word
                for chunk in result.get("segments") or []
                for word in chunk.get("words") or []
            ]

            if words:
                # Whisper word tokens carry their own leading whitespace, so
                # plain concatenation reproduces the spoken text.
                text = "".join(word["word"] for word in words).strip()
            else:
                print(f"No words found for segment {segment['path']}. Using full text as fallback.")
                text = result.get("text", "").strip()

            transcriptions.append({
                "speaker": segment["speaker"],
                "start": segment["start"],
                "end": segment["end"],
                "text": text,
            })

        except Exception as e:
            # Best effort: one bad clip must not abort the whole recording.
            print(f"Error transcribing segment {segment['path']}: {e}")
            continue

    # Only echo the last result when at least one transcription succeeded;
    # otherwise there is nothing to print.
    if last_result is not None:
        print("\n")
        print(last_result)

    return transcriptions

# Function to smooth boundaries and combine overlapping segments
def smooth_boundaries(transcriptions):
    """Merge consecutive overlapping entries that belong to the same speaker.

    Two neighbouring entries are combined when they share a speaker label and
    the earlier one ends at or after the later one starts. The merged entry
    keeps the earlier start, the later of the two ends, and the texts joined
    by a single space. Entries from different speakers are never merged, so
    speaker attribution is preserved.

    Args:
        transcriptions: List of dicts with ``speaker``, ``start``, ``end`` and
            ``text`` keys, ordered by start time.

    Returns:
        A new list of new dicts; the input list and its dicts are left
        unmodified.
    """
    smoothed = []

    for trans in transcriptions:
        previous = smoothed[-1] if smoothed else None
        mergeable = (
            previous is not None
            and previous.get("speaker") == trans.get("speaker")
            and previous["end"] >= trans["start"]
        )

        if mergeable:
            # Skip empty pieces so no stray spaces end up in the text.
            pieces = (previous["text"], trans["text"])
            previous["text"] = " ".join(piece for piece in pieces if piece)
            # Never move the end backwards when the next entry is contained
            # inside the current one.
            previous["end"] = max(previous["end"], trans["end"])
        else:
            # Copy so later merges do not mutate the caller's dicts.
            smoothed.append(dict(trans))

    return smoothed

# Main processing loop
def process_audio(reclinks, whisper_model, diarization_pipeline, output_file,
                  min_speakers=3, max_speakers=5):
    """Download, diarize and transcribe each recording, then save the results.

    For every link the audio is downloaded, converted to a mono 16 kHz WAV,
    diarized, split into per-turn clips, transcribed and smoothed. Temporary
    files are removed after each recording whether or not processing
    succeeded. A failure on one recording is reported and does not stop the
    remaining ones.

    Args:
        reclinks: Iterable of recording URLs.
        whisper_model: Model passed through to ``transcribe_segments``.
        diarization_pipeline: Callable invoked as
            ``pipeline(path, min_speakers=..., max_speakers=...)``.
        output_file: Path of the Excel file that receives all transcriptions.
        min_speakers: Lower bound on the speaker count given to the pipeline.
        max_speakers: Upper bound on the speaker count given to the pipeline.
    """
    all_transcriptions = []
    converted_path = "converted_file.wav"

    for rec_link in reclinks:
        print(f"Processing: {rec_link}")
        file_name = download_audio_file(rec_link, f"audio_temp_{int(time.time())}.mp3")

        if not file_name:
            continue

        # Reset per recording so the cleanup below never sees an unbound name
        # or clip paths left over from the previous recording.
        segments = []
        try:
            # Convert MP3 to WAV with mono channel and 16kHz sample rate
            audio = AudioSegment.from_file(file_name, format="mp3")
            audio = audio.set_channels(1).set_frame_rate(16000)
            audio.export(converted_path, format="wav")

            # Run diarization
            diarization = diarization_pipeline(
                converted_path,
                min_speakers=min_speakers,
                max_speakers=max_speakers,
            )

            # Split audio based on diarization results
            segments = split_audio_by_diarization(converted_path, diarization)

            # Transcribe each segment
            transcriptions = transcribe_segments(segments, whisper_model)

            # Smooth boundaries and finalize transcriptions
            all_transcriptions.extend(smooth_boundaries(transcriptions))

        except Exception as e:
            # Best effort: keep going with the remaining recordings.
            print(f"Error processing {rec_link}: {e}")
        finally:
            # Clean up temporary files; delete_audio_file ignores paths that
            # do not exist, so no extra existence check is needed here.
            delete_audio_file(file_name)
            delete_audio_file(converted_path)
            for segment in segments:
                delete_audio_file(segment["path"])

    # Save transcriptions to an output file
    save_transcriptions_to_excel(all_transcriptions, output_file)


# Main script
if __name__ == "__main__":
    # Configuration
    reclinks = ['https://media.plivo.com/v1/Account/MAYWQZMDLMMTY0YZHINJ/Recording/39160473-1ed2-447a-8efa-17da29ce74ae.mp3']
    output_file = "SD_pyannote_method2_single_run_output.xlsx"

    # SECURITY: the Hugging Face access token must never be committed to
    # source. It is read from the environment instead; any token that was
    # previously embedded in this file should be revoked and regenerated.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise SystemExit(
            "Set the HF_TOKEN environment variable to a Hugging Face access "
            "token before running this script."
        )

    # Load models
    print("Loading Whisper model...")
    whisper_model = whisper.load_model("small")
    print("Loading diarization pipeline...")
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization@2.1",
        use_auth_token=hf_token,
    )

    # Process audio files
    process_audio(reclinks, whisper_model, diarization_pipeline, output_file)


Loading Whisper model...
Loading diarization pipeline...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.2+cu121. Bad things might happen unless you revert torch to 1.x.


INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder


Processing: https://media.plivo.com/v1/Account/MAYWQZMDLMMTY0YZHINJ/Recording/39160473-1ed2-447a-8efa-17da29ce74ae.mp3
Audio file saved: audio_temp_1733316577.mp3
Transcribing segment: segment_0_speaker_SPEAKER_02.wav (Speaker: SPEAKER_02)




Transcription result for segment_0_speaker_SPEAKER_02.wav: {'text': ' We are dialing out.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.88, 'text': ' We are dialing out.', 'tokens': [50364, 492, 366, 5502, 278, 484, 13, 50414], 'temperature': 0.0, 'avg_logprob': -0.3721461296081543, 'compression_ratio': 0.7037037037037037, 'no_speech_prob': 0.012154572643339634, 'words': [{'word': ' We', 'start': 0.0, 'end': 0.14, 'probability': 0.7910024523735046}, {'word': ' are', 'start': 0.14, 'end': 0.24, 'probability': 0.9533327221870422}, {'word': ' dialing', 'start': 0.24, 'end': 0.68, 'probability': 0.7783835232257843}, {'word': ' out.', 'start': 0.68, 'end': 0.88, 'probability': 0.9935570359230042}]}], 'language': 'en'}
No words found for segment segment_0_speaker_SPEAKER_02.wav. Using full text as fallback.
Transcribing segment: segment_1_speaker_SPEAKER_01.wav (Speaker: SPEAKER_01)




Transcription result for segment_1_speaker_SPEAKER_01.wav: {'text': " I'm putting good in the moment. How can I help you?", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 1.7, 'text': " I'm putting good in the moment. How can I help you?", 'tokens': [50364, 286, 478, 3372, 665, 294, 264, 1623, 13, 1012, 393, 286, 854, 291, 30, 50464], 'temperature': 0.0, 'avg_logprob': -0.7423443513758042, 'compression_ratio': 0.864406779661017, 'no_speech_prob': 0.025712566450238228, 'words': [{'word': " I'm", 'start': 0.0, 'end': 0.3, 'probability': 0.28815340995788574}, {'word': ' putting', 'start': 0.3, 'end': 0.5, 'probability': 0.10784363746643066}, {'word': ' good', 'start': 0.5, 'end': 0.68, 'probability': 0.5630713105201721}, {'word': ' in', 'start': 0.68, 'end': 0.8, 'probability': 0.0482206866145134}, {'word': ' the', 'start': 0.8, 'end': 0.82, 'probability': 0.7546097040176392}, {'word': ' moment.', 'start': 0.82, 'end': 1.02, 'probability': 0.4609586298465729}, {'word': ' How', 'st



Transcription result for segment_2_speaker_SPEAKER_00.wav: {'text': ' Thank you.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.1, 'text': ' Thank you.', 'tokens': [50364, 1044, 291, 13, 50414], 'temperature': 0.0, 'avg_logprob': -0.7428859869639078, 'compression_ratio': 0.5555555555555556, 'no_speech_prob': 0.6358894109725952, 'words': [{'word': ' Thank', 'start': 0.0, 'end': 0.0, 'probability': 0.15961474180221558}, {'word': ' you.', 'start': 0.0, 'end': 0.1, 'probability': 0.9876921772956848}]}], 'language': 'en'}
No words found for segment segment_2_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing segment: segment_3_speaker_SPEAKER_00.wav (Speaker: SPEAKER_00)




Transcription result for segment_3_speaker_SPEAKER_00.wav: {'text': ' Hello, synagogically to Nathan Keele.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.22, 'text': ' Hello, synagogically to Nathan Keele.', 'tokens': [50364, 2425, 11, 5451, 31599, 984, 281, 20634, 591, 1653, 306, 13, 50480], 'temperature': 0.0, 'avg_logprob': -0.8955510003226144, 'compression_ratio': 0.8222222222222222, 'no_speech_prob': 0.28789567947387695, 'words': [{'word': ' Hello,', 'start': 0.0, 'end': 0.36, 'probability': 0.8271461129188538}, {'word': ' synagogically', 'start': 0.46, 'end': 1.12, 'probability': 0.42078932623068493}, {'word': ' to', 'start': 1.12, 'end': 1.4, 'probability': 0.6202367544174194}, {'word': ' Nathan', 'start': 1.4, 'end': 1.76, 'probability': 0.987514317035675}, {'word': ' Keele.', 'start': 1.76, 'end': 2.22, 'probability': 0.3777701333165169}]}], 'language': 'en'}
No words found for segment segment_3_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing seg



Transcription result for segment_4_speaker_SPEAKER_00.wav: {'text': " Sorry, what's the name?", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.92, 'text': " Sorry, what's the name?", 'tokens': [50364, 4919, 11, 437, 311, 264, 1315, 30, 50420], 'temperature': 0.0, 'avg_logprob': -0.45357766151428225, 'compression_ratio': 0.7419354838709677, 'no_speech_prob': 0.06547130644321442, 'words': [{'word': ' Sorry,', 'start': 0.0, 'end': 0.34, 'probability': 0.7047063112258911}, {'word': " what's", 'start': 0.48, 'end': 0.68, 'probability': 0.8393945693969727}, {'word': ' the', 'start': 0.68, 'end': 0.78, 'probability': 0.5735313892364502}, {'word': ' name?', 'start': 0.78, 'end': 0.92, 'probability': 0.9969913959503174}]}], 'language': 'en'}
No words found for segment segment_4_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing segment: segment_5_speaker_SPEAKER_00.wav (Speaker: SPEAKER_00)




Transcription result for segment_5_speaker_SPEAKER_00.wav: {'text': ' Nathan, N-A-T-H-A-N, and the family name is H-A-L-E-F.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 8.3, 'text': ' Nathan, N-A-T-H-A-N, and the family name is H-A-L-E-F.', 'tokens': [50364, 20634, 11, 426, 12, 32, 12, 51, 12, 39, 12, 32, 12, 45, 11, 293, 264, 1605, 1315, 307, 389, 12, 32, 12, 43, 12, 36, 12, 37, 13, 50782], 'temperature': 0.0, 'avg_logprob': -0.24265430867671967, 'compression_ratio': 0.9310344827586207, 'no_speech_prob': 0.3272314965724945, 'words': [{'word': ' Nathan,', 'start': 0.0, 'end': 0.56, 'probability': 0.7438210248947144}, {'word': ' N', 'start': 0.82, 'end': 1.2, 'probability': 0.7137832045555115}, {'word': '-A', 'start': 1.2, 'end': 1.54, 'probability': 0.8719706535339355}, {'word': '-T', 'start': 1.54, 'end': 1.86, 'probability': 0.9951997995376587}, {'word': '-H', 'start': 1.86, 'end': 2.44, 'probability': 0.9082989692687988}, {'word': '-A', 'start': 2.44, 'end': 2.7, 'proba



Transcription result for segment_6_speaker_SPEAKER_01.wav: {'text': ' Yeah, give me just one second.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 1.04, 'text': ' Yeah, give me just one second.', 'tokens': [50364, 865, 11, 976, 385, 445, 472, 1150, 13, 50464], 'temperature': 0.0, 'avg_logprob': -0.39890770478682086, 'compression_ratio': 0.7894736842105263, 'no_speech_prob': 0.14881139993667603, 'words': [{'word': ' Yeah,', 'start': 0.0, 'end': 0.26, 'probability': 0.597109317779541}, {'word': ' give', 'start': 0.42, 'end': 0.42, 'probability': 0.709945559501648}, {'word': ' me', 'start': 0.42, 'end': 0.54, 'probability': 0.9882634878158569}, {'word': ' just', 'start': 0.54, 'end': 0.7, 'probability': 0.991003155708313}, {'word': ' one', 'start': 0.7, 'end': 0.84, 'probability': 0.9540472030639648}, {'word': ' second.', 'start': 0.84, 'end': 1.04, 'probability': 0.9924619793891907}]}], 'language': 'en'}
No words found for segment segment_6_speaker_SPEAKER_01.wav. Using full t



Transcription result for segment_7_speaker_SPEAKER_01.wav: {'text': ' Okay.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.38, 'text': ' Okay.', 'tokens': [50364, 1033, 13, 50389], 'temperature': 0.0, 'avg_logprob': -0.5234713077545166, 'compression_ratio': 0.38461538461538464, 'no_speech_prob': 0.4865879416465759, 'words': [{'word': ' Okay.', 'start': 0.0, 'end': 0.38, 'probability': 0.517102062702179}]}], 'language': 'en'}
No words found for segment segment_7_speaker_SPEAKER_01.wav. Using full text as fallback.
Transcribing segment: segment_8_speaker_SPEAKER_02.wav (Speaker: SPEAKER_02)




Transcription result for segment_8_speaker_SPEAKER_02.wav: {'text': " Thank you for holding. We'll be with you in a moment.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.1, 'text': " Thank you for holding. We'll be with you in a moment.", 'tokens': [50364, 1044, 291, 337, 5061, 13, 492, 603, 312, 365, 291, 294, 257, 1623, 13, 50480], 'temperature': 0.0, 'avg_logprob': -0.21255125718958237, 'compression_ratio': 0.9137931034482759, 'no_speech_prob': 0.060657184571027756, 'words': [{'word': ' Thank', 'start': 0.0, 'end': 0.26, 'probability': 0.8258600831031799}, {'word': ' you', 'start': 0.26, 'end': 0.38, 'probability': 0.9892039895057678}, {'word': ' for', 'start': 0.38, 'end': 0.5, 'probability': 0.9952461123466492}, {'word': ' holding.', 'start': 0.5, 'end': 0.72, 'probability': 0.9766720533370972}, {'word': " We'll", 'start': 1.06, 'end': 1.22, 'probability': 0.9705256819725037}, {'word': ' be', 'start': 1.22, 'end': 1.34, 'probability': 0.9993327260017395}, {'word': ' w



Transcription result for segment_9_speaker_SPEAKER_02.wav: {'text': " Thank you for calling out your go-to destination for all your outdoor needs. We're excited to announce our two new store locations in Boise, Idaho and St. George, Utah. At the heart of the state capital, our beautiful Boise store has everything you need to explore the hundreds of hiking and mountain biking trails, rivers, and other adventures around the Treasure Valley. Our St. George store is your one-stop shop to gear up for all your past", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 3.68, 'text': ' Thank you for calling out your go-to destination for all your outdoor needs.', 'tokens': [50364, 1044, 291, 337, 5141, 484, 428, 352, 12, 1353, 12236, 337, 439, 428, 15942, 2203, 13, 50568], 'temperature': 0.0, 'avg_logprob': -0.16953723230094553, 'compression_ratio': 1.6444444444444444, 'no_speech_prob': 0.1777353733778, 'words': [{'word': ' Thank', 'start': 0.0, 'end': 0.26, 'probability': 0.646910250186920



Transcription result for segment_10_speaker_SPEAKER_02.wav: {'text': '', 'segments': [], 'language': 'en'}
No words found for segment segment_10_speaker_SPEAKER_02.wav. Using full text as fallback.
Transcribing segment: segment_11_speaker_SPEAKER_00.wav (Speaker: SPEAKER_00)




Transcription result for segment_11_speaker_SPEAKER_00.wav: {'text': ' Thanks.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.3, 'text': ' Thanks.', 'tokens': [50364, 2561, 13, 50389], 'temperature': 0.0, 'avg_logprob': -0.8081751823425293, 'compression_ratio': 0.4666666666666667, 'no_speech_prob': 0.33692729473114014, 'words': [{'word': ' Thanks.', 'start': 0.0, 'end': 0.3, 'probability': 0.150005042552948}]}], 'language': 'en'}
No words found for segment segment_11_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing segment: segment_12_speaker_SPEAKER_00.wav (Speaker: SPEAKER_00)




Transcription result for segment_12_speaker_SPEAKER_00.wav: {'text': ' Hello.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.32, 'text': ' Hello.', 'tokens': [50364, 2425, 13, 50389], 'temperature': 0.0, 'avg_logprob': -0.6372888565063477, 'compression_ratio': 0.42857142857142855, 'no_speech_prob': 0.054724711924791336, 'words': [{'word': ' Hello.', 'start': 0.0, 'end': 0.32, 'probability': 0.8154264092445374}]}], 'language': 'en'}
No words found for segment segment_12_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing segment: segment_13_speaker_SPEAKER_00.wav (Speaker: SPEAKER_00)




Transcription result for segment_13_speaker_SPEAKER_00.wav: {'text': ' All at night.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.72, 'text': ' All at night.', 'tokens': [50364, 1057, 412, 1818, 13, 50408], 'temperature': 0.2, 'avg_logprob': -0.8021758624485561, 'compression_ratio': 0.6190476190476191, 'no_speech_prob': 0.19702544808387756, 'words': [{'word': ' All', 'start': 0.0, 'end': 0.28, 'probability': 0.21911203861236572}, {'word': ' at', 'start': 0.28, 'end': 0.44, 'probability': 0.657710611820221}, {'word': ' night.', 'start': 0.44, 'end': 0.72, 'probability': 0.41806280612945557}]}], 'language': 'en'}
No words found for segment segment_13_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing segment: segment_14_speaker_SPEAKER_00.wav (Speaker: SPEAKER_00)




Transcription result for segment_14_speaker_SPEAKER_00.wav: {'text': " Hello, here's my speaking with Nathan Hales.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.18, 'text': " Hello, here's my speaking with Nathan Hales.", 'tokens': [50364, 2425, 11, 510, 311, 452, 4124, 365, 20634, 389, 4229, 13, 50484], 'temperature': 0.0, 'avg_logprob': -0.8697436196463448, 'compression_ratio': 0.8461538461538461, 'no_speech_prob': 0.025332460179924965, 'words': [{'word': ' Hello,', 'start': 0.0, 'end': 0.3, 'probability': 0.5455479621887207}, {'word': " here's", 'start': 0.46, 'end': 0.68, 'probability': 0.5649552494287491}, {'word': ' my', 'start': 0.68, 'end': 0.78, 'probability': 0.05590282008051872}, {'word': ' speaking', 'start': 0.78, 'end': 1.16, 'probability': 0.917904794216156}, {'word': ' with', 'start': 1.16, 'end': 1.42, 'probability': 0.9805908799171448}, {'word': ' Nathan', 'start': 1.42, 'end': 1.74, 'probability': 0.9904085397720337}, {'word': ' Hales.', 'start': 1.74, 



Transcription result for segment_15_speaker_SPEAKER_00.wav: {'text': " You are? I'm going to do four.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.42, 'text': ' You are?', 'tokens': [50364, 509, 366, 30, 50389], 'temperature': 0.0, 'avg_logprob': -0.7427294413248698, 'compression_ratio': 0.7894736842105263, 'no_speech_prob': 0.21907520294189453, 'words': [{'word': ' You', 'start': 0.0, 'end': 0.2, 'probability': 0.5747326016426086}, {'word': ' are?', 'start': 0.2, 'end': 0.42, 'probability': 0.9893882274627686}]}, {'id': 1, 'seek': 0, 'start': 0.84, 'end': 1.56, 'text': " I'm going to do four.", 'tokens': [50389, 286, 478, 516, 281, 360, 1451, 13, 50445], 'temperature': 0.0, 'avg_logprob': -0.7427294413248698, 'compression_ratio': 0.7894736842105263, 'no_speech_prob': 0.21907520294189453, 'words': [{'word': " I'm", 'start': 0.84, 'end': 1.06, 'probability': 0.5633175522089005}, {'word': ' going', 'start': 1.06, 'end': 1.2, 'probability': 0.7962063550949097}, {'word': ' to



Transcription result for segment_16_speaker_SPEAKER_01.wav: {'text': ' I need a pen.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.6, 'text': ' I need a pen.', 'tokens': [50364, 286, 643, 257, 3435, 13, 50406], 'temperature': 0.0, 'avg_logprob': -0.9553088545799255, 'compression_ratio': 0.6190476190476191, 'no_speech_prob': 0.30949026346206665, 'words': [{'word': ' I', 'start': 0.0, 'end': 0.16, 'probability': 0.10198864340782166}, {'word': ' need', 'start': 0.16, 'end': 0.3, 'probability': 0.574360191822052}, {'word': ' a', 'start': 0.3, 'end': 0.46, 'probability': 0.2601879835128784}, {'word': ' pen.', 'start': 0.46, 'end': 0.6, 'probability': 0.6525436639785767}]}], 'language': 'en'}
No words found for segment segment_16_speaker_SPEAKER_01.wav. Using full text as fallback.
Transcribing segment: segment_17_speaker_SPEAKER_01.wav (Speaker: SPEAKER_01)




Transcription result for segment_17_speaker_SPEAKER_01.wav: {'text': ' Yeah, my name is Hina calling you on behalf of Amazon Web Services.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 4.0, 'text': ' Yeah, my name is Hina calling you on behalf of Amazon Web Services.', 'tokens': [50364, 865, 11, 452, 1315, 307, 389, 1426, 5141, 291, 322, 9490, 295, 6795, 9573, 12124, 13, 50576], 'temperature': 0.0, 'avg_logprob': -0.24825733586361534, 'compression_ratio': 0.9054054054054054, 'no_speech_prob': 0.030562950298190117, 'words': [{'word': ' Yeah,', 'start': 0.0, 'end': 0.3, 'probability': 0.6111432313919067}, {'word': ' my', 'start': 0.5, 'end': 0.6, 'probability': 0.9783663749694824}, {'word': ' name', 'start': 0.6, 'end': 0.76, 'probability': 0.9982544779777527}, {'word': ' is', 'start': 0.76, 'end': 0.9, 'probability': 0.9945533275604248}, {'word': ' Hina', 'start': 0.9, 'end': 1.22, 'probability': 0.7508993744850159}, {'word': ' calling', 'start': 1.22, 'end': 1.56, 'probabili



Transcription result for segment_18_speaker_SPEAKER_01.wav: {'text': ' Have aU', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.48, 'text': ' Have aU', 'tokens': [50364, 3560, 257, 52, 50389], 'temperature': 0.0, 'avg_logprob': -0.9884809652964274, 'compression_ratio': 0.4666666666666667, 'no_speech_prob': 0.286200612783432, 'words': [{'word': ' Have', 'start': 0.0, 'end': 0.26, 'probability': 0.22481946647167206}, {'word': ' aU', 'start': 0.26, 'end': 0.48, 'probability': 0.5558903068304062}]}], 'language': 'en'}
No words found for segment segment_18_speaker_SPEAKER_01.wav. Using full text as fallback.
Transcribing segment: segment_19_speaker_SPEAKER_01.wav (Speaker: SPEAKER_01)




Transcription result for segment_19_speaker_SPEAKER_01.wav: {'text': ' And how are you?', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.66, 'text': ' And how are you?', 'tokens': [50364, 400, 577, 366, 291, 30, 50404], 'temperature': 0.0, 'avg_logprob': -0.4707317352294922, 'compression_ratio': 0.6666666666666666, 'no_speech_prob': 0.07671777158975601, 'words': [{'word': ' And', 'start': 0.0, 'end': 0.24, 'probability': 0.4842868745326996}, {'word': ' how', 'start': 0.24, 'end': 0.42, 'probability': 0.9234921336174011}, {'word': ' are', 'start': 0.42, 'end': 0.52, 'probability': 0.9933613538742065}, {'word': ' you?', 'start': 0.52, 'end': 0.66, 'probability': 0.9981922507286072}]}], 'language': 'en'}
No words found for segment segment_19_speaker_SPEAKER_01.wav. Using full text as fallback.
Transcribing segment: segment_20_speaker_SPEAKER_01.wav (Speaker: SPEAKER_01)




Transcription result for segment_20_speaker_SPEAKER_01.wav: {'text': ' I am good. Thank you so much for asking me. So Nathan, we have noticed that your business has been using Amazon Web Services.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 8.16, 'text': ' I am good. Thank you so much for asking me. So Nathan, we have noticed that your business has been using Amazon Web Services.', 'tokens': [50364, 286, 669, 665, 13, 1044, 291, 370, 709, 337, 3365, 385, 13, 407, 20634, 11, 321, 362, 5694, 300, 428, 1606, 575, 668, 1228, 6795, 9573, 12124, 13, 50789], 'temperature': 0.0, 'avg_logprob': -0.3460404795985068, 'compression_ratio': 1.1682242990654206, 'no_speech_prob': 0.054740387946367264, 'words': [{'word': ' I', 'start': 0.0, 'end': 0.2, 'probability': 0.29339534044265747}, {'word': ' am', 'start': 0.2, 'end': 0.28, 'probability': 0.5140935182571411}, {'word': ' good.', 'start': 0.28, 'end': 0.44, 'probability': 0.8098663091659546}, {'word': ' Thank', 'start': 0.62, 'end': 0



Transcription result for segment_21_speaker_SPEAKER_01.wav: {'text': ' And we just wanted to share a limited time offer that could help you leverage AI technology to grow your business as it is an exciting opportunity for small and medium businesses like yours to integrate, generate you AI into your operations.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.36, 'text': ' And we just wanted to share a limited time offer that could help you leverage AI technology', 'tokens': [50364, 400, 321, 445, 1415, 281, 2073, 257, 5567, 565, 2626, 300, 727, 854, 291, 13982, 7318, 2899, 50660], 'temperature': 0.0, 'avg_logprob': -0.1500837288650812, 'compression_ratio': 1.4606060606060607, 'no_speech_prob': 0.005403412040323019, 'words': [{'word': ' And', 'start': 0.0, 'end': 0.24, 'probability': 0.4435684084892273}, {'word': ' we', 'start': 0.24, 'end': 0.4, 'probability': 0.9697312712669373}, {'word': ' just', 'start': 0.4, 'end': 0.6, 'probability': 0.9717953205108643}, {'word': ' want



Transcription result for segment_22_speaker_SPEAKER_01.wav: {'text': " And if you're just interested in learning how a Generative AI Foundation could benefit your specific business needs, we are offering a program that creates a best practice Generative AI Foundation with all the tools needed to quickly integrate AI into your business. So Nathan, so would you just like to register your interest for this limited time offer? And I'll just share the details on your email.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.58, 'text': " And if you're just interested in learning how a Generative AI Foundation could benefit your", 'tokens': [50364, 400, 498, 291, 434, 445, 3102, 294, 2539, 577, 257, 15409, 1166, 7318, 10335, 727, 5121, 428, 50644], 'temperature': 0.0, 'avg_logprob': -0.12906288806303048, 'compression_ratio': 1.6519823788546255, 'no_speech_prob': 0.0006392851355485618, 'words': [{'word': ' And', 'start': 0.0, 'end': 0.28, 'probability': 0.6802835464477539}, {'word': '



Transcription result for segment_23_speaker_SPEAKER_00.wav: {'text': " I'm not interested. Thank you for the call.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.0, 'text': " I'm not interested. Thank you for the call.", 'tokens': [50364, 286, 478, 406, 3102, 13, 1044, 291, 337, 264, 818, 13, 50464], 'temperature': 0.0, 'avg_logprob': -0.2763837916510446, 'compression_ratio': 0.8431372549019608, 'no_speech_prob': 0.013703166507184505, 'words': [{'word': " I'm", 'start': 0.0, 'end': 0.34, 'probability': 0.8163845241069794}, {'word': ' not', 'start': 0.34, 'end': 0.58, 'probability': 0.9817276000976562}, {'word': ' interested.', 'start': 0.58, 'end': 0.98, 'probability': 0.9914417862892151}, {'word': ' Thank', 'start': 1.24, 'end': 1.38, 'probability': 0.9914544820785522}, {'word': ' you', 'start': 1.38, 'end': 1.5, 'probability': 0.9971439242362976}, {'word': ' for', 'start': 1.5, 'end': 1.64, 'probability': 0.9964447617530823}, {'word': ' the', 'start': 1.64, 'end': 1.74, '



Transcription result for segment_24_speaker_SPEAKER_01.wav: {'text': " So can I just share the details on your email whenever you're free you can just take a look", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 4.44, 'text': " So can I just share the details on your email whenever you're free you can just take a look", 'tokens': [50364, 407, 393, 286, 445, 2073, 264, 4365, 322, 428, 3796, 5699, 291, 434, 1737, 291, 393, 445, 747, 257, 574, 50587], 'temperature': 0.0, 'avg_logprob': -0.25217112250950025, 'compression_ratio': 1.1375, 'no_speech_prob': 0.05305175483226776, 'words': [{'word': ' So', 'start': 0.0, 'end': 0.52, 'probability': 0.5242086052894592}, {'word': ' can', 'start': 0.52, 'end': 0.78, 'probability': 0.6534179449081421}, {'word': ' I', 'start': 0.78, 'end': 0.92, 'probability': 0.7181582450866699}, {'word': ' just', 'start': 0.92, 'end': 0.98, 'probability': 0.836433470249176}, {'word': ' share', 'start': 0.98, 'end': 1.3, 'probability': 0.9836500287055969}, {'



Transcription result for segment_25_speaker_SPEAKER_00.wav: {'text': " Sure, it's Nate and they te at ALS.com", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 4.04, 'text': " Sure, it's Nate and they te at ALS.com", 'tokens': [50364, 4894, 11, 309, 311, 28064, 293, 436, 535, 412, 7056, 50, 13, 1112, 50566], 'temperature': 0.0, 'avg_logprob': -0.5308007597923279, 'compression_ratio': 0.8837209302325582, 'no_speech_prob': 0.013553917407989502, 'words': [{'word': ' Sure,', 'start': 0.0, 'end': 0.5, 'probability': 0.45027413964271545}, {'word': " it's", 'start': 0.96, 'end': 1.14, 'probability': 0.9402559101581573}, {'word': ' Nate', 'start': 1.14, 'end': 1.48, 'probability': 0.8939175605773926}, {'word': ' and', 'start': 1.48, 'end': 1.96, 'probability': 0.22959525883197784}, {'word': ' they', 'start': 1.96, 'end': 2.12, 'probability': 0.47536250948905945}, {'word': ' te', 'start': 2.12, 'end': 2.48, 'probability': 0.20547042787075043}, {'word': ' at', 'start': 2.48, 'end': 2.98, 



Transcription result for segment_26_speaker_SPEAKER_01.wav: {'text': " It's natee at ALS.com.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 3.1, 'text': " It's natee at ALS.com.", 'tokens': [50364, 467, 311, 297, 473, 68, 412, 7056, 50, 13, 1112, 13, 50528], 'temperature': 0.0, 'avg_logprob': -0.5778978211539132, 'compression_ratio': 0.7333333333333333, 'no_speech_prob': 0.04272841289639473, 'words': [{'word': " It's", 'start': 0.0, 'end': 0.4, 'probability': 0.7159433364868164}, {'word': ' natee', 'start': 0.4, 'end': 1.36, 'probability': 0.4526776274045308}, {'word': ' at', 'start': 1.36, 'end': 1.74, 'probability': 0.010952002368867397}, {'word': ' ALS', 'start': 1.74, 'end': 2.64, 'probability': 0.69868203997612}, {'word': '.com.', 'start': 2.64, 'end': 3.1, 'probability': 0.9340177774429321}]}], 'language': 'en'}
No words found for segment segment_26_speaker_SPEAKER_01.wav. Using full text as fallback.
Transcribing segment: segment_27_speaker_SPEAKER_01.wav (Speaker: SP



Transcription result for segment_27_speaker_SPEAKER_01.wav: {'text': ' All right.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.24, 'text': ' All right.', 'tokens': [50364, 1057, 558, 13, 50389], 'temperature': 0.0, 'avg_logprob': -0.676389217376709, 'compression_ratio': 0.5555555555555556, 'no_speech_prob': 0.18005572259426117, 'words': [{'word': ' All', 'start': 0.0, 'end': 0.18, 'probability': 0.38440045714378357}, {'word': ' right.', 'start': 0.18, 'end': 0.24, 'probability': 0.9917033314704895}]}], 'language': 'en'}
No words found for segment segment_27_speaker_SPEAKER_01.wav. Using full text as fallback.
Transcribing segment: segment_28_speaker_SPEAKER_01.wav (Speaker: SPEAKER_01)




Transcription result for segment_28_speaker_SPEAKER_01.wav: {'text': ' Okay, so and you are working as one of the director of operations at AI Sporting Goods Care.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.38, 'text': ' Okay, so and you are working as one of the director of operations at AI Sporting Goods Care.', 'tokens': [50364, 1033, 11, 370, 293, 291, 366, 1364, 382, 472, 295, 264, 5391, 295, 7705, 412, 7318, 17549, 278, 2205, 82, 9532, 13, 50636], 'temperature': 0.0, 'avg_logprob': -0.4634378814697266, 'compression_ratio': 1.0454545454545454, 'no_speech_prob': 0.06738585978746414, 'words': [{'word': ' Okay,', 'start': 0.0, 'end': 0.44, 'probability': 0.24611727893352509}, {'word': ' so', 'start': 0.74, 'end': 1.0, 'probability': 0.767005443572998}, {'word': ' and', 'start': 1.0, 'end': 1.3, 'probability': 0.8589537739753723}, {'word': ' you', 'start': 1.3, 'end': 1.44, 'probability': 0.9426672458648682}, {'word': ' are', 'start': 1.44, 'end': 1.5, 'probability': 0



Transcription result for segment_29_speaker_SPEAKER_00.wav: {'text': ' All right.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 0.26, 'text': ' All right.', 'tokens': [50364, 1057, 558, 13, 50442], 'temperature': 0.0, 'avg_logprob': -0.6677219072977701, 'compression_ratio': 0.5555555555555556, 'no_speech_prob': 0.12638649344444275, 'words': [{'word': ' All', 'start': 0.0, 'end': 0.2, 'probability': 0.4724128246307373}, {'word': ' right.', 'start': 0.2, 'end': 0.26, 'probability': 0.9929537177085876}]}], 'language': 'en'}
No words found for segment segment_29_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing segment: segment_30_speaker_SPEAKER_01.wav (Speaker: SPEAKER_01)




Transcription result for segment_30_speaker_SPEAKER_01.wav: {'text': " Okay, so what I'll do Nathan, I'll share all the details on your email and you can take a look. And so just I can opt you in. Thank you. And would you like AWS to share your contact information title company and the details of this project with Innovative Solutions so that they may contact you by email, post or phone to assist you. And in addition to your name, email and phone number, Innovative Solutions would be provided with the project title, project description, project revenue estimates to AWS and the project timeline. Okay.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.18, 'text': " Okay, so what I'll do Nathan, I'll share all the details on your email and you can take a look.", 'tokens': [50364, 1033, 11, 370, 437, 286, 603, 360, 20634, 11, 286, 603, 2073, 439, 264, 4365, 322, 428, 3796, 293, 291, 393, 747, 257, 574, 13, 50632], 'temperature': 0.0, 'avg_logprob': -0.1542563065238621, 'compressio



Transcription result for segment_31_speaker_SPEAKER_00.wav: {'text': " No, don't share my information.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 1.24, 'text': " No, don't share my information.", 'tokens': [50364, 883, 11, 500, 380, 2073, 452, 1589, 13, 50444], 'temperature': 0.0, 'avg_logprob': -0.3889002799987793, 'compression_ratio': 0.7948717948717948, 'no_speech_prob': 0.03440553694963455, 'words': [{'word': ' No,', 'start': 0.0, 'end': 0.44, 'probability': 0.7848523855209351}, {'word': " don't", 'start': 0.58, 'end': 0.76, 'probability': 0.9876767098903656}, {'word': ' share', 'start': 0.76, 'end': 0.86, 'probability': 0.452224463224411}, {'word': ' my', 'start': 0.86, 'end': 0.98, 'probability': 0.9882431626319885}, {'word': ' information.', 'start': 0.98, 'end': 1.24, 'probability': 0.996026873588562}]}], 'language': 'en'}
No words found for segment segment_31_speaker_SPEAKER_00.wav. Using full text as fallback.
Transcribing segment: segment_32_speaker_SPEAKER_01.



Transcription result for segment_32_speaker_SPEAKER_01.wav: {'text': ' Okay, no problem. Thank you.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 1.5, 'text': ' Okay, no problem. Thank you.', 'tokens': [50364, 1033, 11, 572, 1154, 13, 1044, 291, 13, 50444], 'temperature': 0.0, 'avg_logprob': -0.4402239105918191, 'compression_ratio': 0.7777777777777778, 'no_speech_prob': 0.046502936631441116, 'words': [{'word': ' Okay,', 'start': 0.0, 'end': 0.36, 'probability': 0.3680709898471832}, {'word': ' no', 'start': 0.44, 'end': 0.5, 'probability': 0.9960391521453857}, {'word': ' problem.', 'start': 0.5, 'end': 0.9, 'probability': 0.9946098327636719}, {'word': ' Thank', 'start': 1.1, 'end': 1.24, 'probability': 0.9953155517578125}, {'word': ' you.', 'start': 1.24, 'end': 1.5, 'probability': 0.9957893490791321}]}], 'language': 'en'}
No words found for segment segment_32_speaker_SPEAKER_01.wav. Using full text as fallback.


{'text': ' Okay, no problem. Thank you.', 'segments': [{'id': 