<a href="https://colab.research.google.com/github/Szozan/2d-strategic-plan/blob/main/deliverables/Step_5_Speech_Recognition_with_Speaker_Diarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses the local speech-recognition toolkit "Vosk" to ingest an MP3 file and transcribe it with speaker diarisation (labelling which speaker, e.g. "Speaker 1" or "Speaker 2", said each utterance). The transcript is saved as both a text file and a JSON file.

In [None]:
# ===== SETUP =====
print("Setting up the environment...")
import sys
import subprocess

def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    ``subprocess.check_call`` raises ``CalledProcessError`` if pip exits with a
    non-zero status, so a failed install stops the setup cell.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Third-party packages the later cells import (numpy, pydub, vosk).
# NOTE(review): versions are not pinned, so a fresh run may pick up newer releases.
required_packages = ['numpy', 'pydub', 'vosk']

# Install one package at a time so the log shows which one is in progress;
# install() raises if pip fails, which aborts the remaining installs.
for package in required_packages:
    print(f"Installing {package}...")
    install(package)

print("Setup complete. All required packages have been installed.")


Setting up the environment...
Installing numpy...
Installing pydub...
Installing vosk...
Setup complete. All required packages have been installed.


In [None]:
# ===== IMPORTS =====
import io
import json
import os
import wave
from collections import defaultdict

import numpy as np
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer, SpkModel

print("All necessary modules imported successfully.")

All necessary modules imported successfully.


In [None]:
# ===== GOOGLE DRIVE MOUNTING =====
def mount_google_drive():
    """Mount Google Drive at ``/content/drive`` and report the outcome on stdout.

    The ``google.colab`` import happens inside the function so the notebook can
    still be loaded outside Colab; in that case a hint is printed instead of
    raising. Any other failure during mounting is printed as well.
    """
    mount_point = '/content/drive'
    print("Attempting to mount Google Drive...")
    try:
        from google.colab import drive
        drive.mount(mount_point)
        print("Google Drive successfully mounted.")
    except ImportError:
        # The colab package is not importable in this environment.
        print("Google Colab module not found. Are you running this in Google Colab?")
    except Exception as mount_error:
        print(f"An error occurred while mounting Google Drive: {mount_error}")

In [None]:
# ===== AUDIO CONVERSION =====
def convert_mp3_to_wav(mp3_file_path, output_format="wav", channels=1, frame_rate=16000):
    """Decode an MP3 file and re-encode it in memory for transcription.

    Args:
        mp3_file_path: Path of the MP3 file to read.
        output_format: Format name passed to the exporter (default ``"wav"``).
        channels: Channel count to convert the audio to (default 1).
        frame_rate: Frame rate to convert the audio to (default 16000).

    Returns:
        An ``io.BytesIO`` holding the exported audio, rewound to position 0,
        or ``None`` if any step fails (the error is printed, not raised).
    """
    print(f"Converting MP3 file: {mp3_file_path}")
    try:
        decoded = AudioSegment.from_file(mp3_file_path, format="mp3")
        with_channels = decoded.set_channels(channels)
        resampled = with_channels.set_frame_rate(frame_rate)

        buffer = io.BytesIO()
        resampled.export(buffer, format=output_format)
        # Rewind so the caller reads from the first byte.
        buffer.seek(0)
        print("MP3 to WAV conversion completed.")
        return buffer
    except Exception as conversion_error:
        print(f"Error during MP3 to WAV conversion: {conversion_error}")
        return None

In [None]:
# ===== TRANSCRIPTION AND DIARIZATION =====
def _open_pcm_source(wav_stream, fallback_rate=16000, chunk_bytes=4000):
    """Return ``(read_chunk, sample_rate)`` for ``wav_stream``.

    If the stream parses as a WAV container, ``read_chunk()`` yields audio
    frames only (the header is skipped) and ``sample_rate`` is the rate stored
    in the header. Otherwise the stream is rewound when possible and read as-is
    in ``chunk_bytes`` pieces, with ``fallback_rate`` as the assumed rate.
    """
    try:
        start_pos = wav_stream.tell()
    except (AttributeError, OSError):
        start_pos = None  # cannot rewind if header parsing fails

    try:
        wav_reader = wave.open(wav_stream, "rb")
    except (wave.Error, EOFError):
        # Not a (complete) WAV container: fall back to reading the raw bytes.
        if start_pos is not None:
            wav_stream.seek(start_pos)
        return (lambda: wav_stream.read(chunk_bytes)), fallback_rate

    # Keep roughly the same chunk size in bytes as the raw path.
    frame_bytes = max(1, wav_reader.getsampwidth() * wav_reader.getnchannels())
    frames_per_chunk = max(1, chunk_bytes // frame_bytes)
    return (lambda: wav_reader.readframes(frames_per_chunk)), wav_reader.getframerate()


def transcribe_with_diarization(wav_stream, model_path, spk_model_path, num_speakers=2):
    """Transcribe an audio stream with Vosk, attaching speaker vectors to results.

    Args:
        wav_stream: Binary file-like object with the audio. When it is a WAV
            container, its header is parsed so that only audio frames are fed
            to the recognizer and the recognizer is created with the stream's
            real sample rate; otherwise the bytes are fed as-is at 16000 Hz.
        model_path: Path passed to ``Model``.
        spk_model_path: Path passed to ``SpkModel``.
        num_speakers: Currently unused; kept so existing callers keep working.

    Returns:
        A list of decoded JSON results (one per finalized segment, followed by
        the recognizer's final result), or ``None`` if anything fails; the
        error is printed rather than raised. Word-level output is requested
        from the recognizer so results can carry per-word entries.
    """
    print("Initializing speech recognition model...")
    try:
        model = Model(model_path)
        spk_model = SpkModel(spk_model_path)
        read_chunk, sample_rate = _open_pcm_source(wav_stream)
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetSpkModel(spk_model)
        # Ask for per-word entries so downstream code can use real timings.
        rec.SetWords(True)

        print("Starting transcription and diarization process...")
        results = []
        total_audio_processed = 0
        while True:
            data = read_chunk()
            if len(data) == 0:
                break
            total_audio_processed += len(data)
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                print(f"Intermediate result: {result}")
                results.append(result)

        # Flush whatever the recognizer is still holding.
        final_result = json.loads(rec.FinalResult())
        print(f"Final result: {final_result}")
        results.append(final_result)

        print(f"Transcription and diarization completed. Total audio processed: {total_audio_processed} bytes")
        print(f"Number of results: {len(results)}")
        return results
    except Exception as e:
        print(f"Error during transcription and diarization: {e}")
        return None

In [None]:
# ===== UTTERANCE PROCESSING =====
def cosine_similarity(a, b):
    """Calculate the cosine similarity between two vectors.

    Args:
        a: First vector (any array-like accepted by NumPy).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. If either vector has zero length the
        ratio is undefined, so ``0.0`` is returned instead of NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 (NaN plus a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(a, b) / denominator

def _timed_words(result, clock, avg_word_duration):
    """Return ``(words, new_clock)`` where ``words`` is a list of ``(word, start, end)``.

    If ``result['result']`` is a non-empty list of dicts that each carry
    ``'word'``, ``'start'`` and ``'end'`` (assumed to be the recognizer's
    per-word output), those timings are used and the clock jumps to the last
    word's end. Otherwise the words of ``result['text']`` are laid out back to
    back from ``clock``, each lasting ``avg_word_duration``.
    """
    entries = result.get('result')
    if isinstance(entries, list) and entries and all(
        isinstance(entry, dict) and {'word', 'start', 'end'} <= entry.keys()
        for entry in entries
    ):
        timed = [(entry['word'], entry['start'], entry['end']) for entry in entries]
        return timed, timed[-1][2]

    timed = []
    for word in result.get('text', '').split():
        start = clock
        clock += avg_word_duration
        timed.append((word, start, clock))
    return timed, clock


def group_into_utterances(transcription_results, max_words_per_utterance=15,
                          total_duration=60, similarity_threshold=0.95):
    """Group recognizer results into speaker-labelled utterances.

    Args:
        transcription_results: List of result dicts. ``'text'`` holds the
            words, ``'spk'`` (optional) a speaker vector, ``'result'``
            (optional) per-word timing entries.
        max_words_per_utterance: An utterance is closed once it holds this
            many words.
        total_duration: Assumed audio length in seconds, used only to estimate
            timings for results that carry no per-word timing entries.
        similarity_threshold: A result is attributed to "Speaker 1" when the
            cosine similarity of its vector to the first vector seen exceeds
            this value; otherwise to "Speaker 2".

    Returns:
        A list of dicts with keys ``'words'``, ``'start'``, ``'end'`` and
        ``'speaker'``. An utterance never mixes speakers: it is closed when
        the speaker label changes. Results with words but no ``'spk'`` are
        kept and labelled "Unknown Speaker". Results without words are
        ignored. An input without any words yields an empty list.
    """
    print("Processing transcription results into utterances...")
    utterances = []
    current_utterance = None
    reference_vector = None  # speaker vector of the first speaker heard

    total_words = sum(len(result.get('text', '').split()) for result in transcription_results)
    # Guard against an empty transcript instead of dividing by zero.
    avg_word_duration = total_duration / total_words if total_words else 0.0

    current_time = 0
    for result in transcription_results:
        words, current_time = _timed_words(result, current_time, avg_word_duration)
        if not words:
            continue

        # Determine the speaker for this result.
        if 'spk' in result:
            spk_vector = np.array(result['spk'])
            if reference_vector is None:
                reference_vector = spk_vector
                current_speaker = "Speaker 1"
            elif cosine_similarity(spk_vector, reference_vector) > similarity_threshold:
                current_speaker = "Speaker 1"
            else:
                current_speaker = "Speaker 2"
        else:
            current_speaker = "Unknown Speaker"

        # Never let one utterance span two speakers.
        if current_utterance is not None and current_utterance['speaker'] != current_speaker:
            utterances.append(current_utterance)
            current_utterance = None

        for word, start, end in words:
            if current_utterance is None:
                # The start time is fixed once, when the utterance is opened.
                current_utterance = {
                    'words': [],
                    'start': start,
                    'end': end,
                    'speaker': current_speaker,
                }
            current_utterance['words'].append(word)
            current_utterance['end'] = end

            if len(current_utterance['words']) >= max_words_per_utterance:
                utterances.append(current_utterance)
                current_utterance = None

    # Add any remaining words as an utterance
    if current_utterance is not None:
        utterances.append(current_utterance)

    print(f"Processed {len(utterances)} utterances.")
    return utterances

In [None]:
# ===== MAIN EXECUTION =====
def _save_transcript(utterances, output_text_path, output_json_path):
    """Write utterances as a text transcript and as JSON, echoing each text line."""
    with open(output_text_path, 'w', encoding='utf-8') as f:
        for utterance in utterances:
            text = " ".join(utterance['words'])
            speaker = utterance.get('speaker', 'Unknown Speaker')
            start_time = utterance['start']
            end_time = utterance['end']
            output = f"{speaker} [{start_time:.2f}s - {end_time:.2f}s]: {text}"
            print(output)
            f.write(output + '\n')
    print(f"\nResults saved to {output_text_path}")

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(utterances, f, indent=2)
    print(f"Results also saved in JSON format to {output_json_path}")


def main(mp3_file_path='/content/drive/MyDrive/ISEA_Test_Audio/Test_audio.mp3',
         model_path='/content/drive/MyDrive/ISEA_Test_Audio/vosk-model-en-us-0.22',
         spk_model_path='/content/drive/MyDrive/ISEA_Test_Audio/vosk-model-en-us-0.22/spk_model',
         output_dir='/content/drive/MyDrive/ISEA_Test_Audio'):
    """Run the full pipeline: mount Drive, convert, transcribe, group, save.

    Args:
        mp3_file_path: MP3 file to transcribe.
        model_path: Directory of the Vosk speech model.
        spk_model_path: Directory of the Vosk speaker model.
        output_dir: Directory that receives ``transcription_results.txt`` and
            ``transcription_results.json``; created if missing.

    Every argument defaults to the Google Drive location the notebook was
    written for, so calling ``main()`` with no arguments behaves as before.
    Each stage prints its progress; on any failure a message is printed and
    the function returns early. Nothing is returned.
    """
    # Mount Google Drive
    mount_google_drive()

    print("\nUsing the following paths:")
    print(f"MP3 File: {mp3_file_path}")
    print(f"Vosk Model: {model_path}")
    print(f"Speaker Model: {spk_model_path}")
    print(f"Output Directory: {output_dir}")

    # Check that every input exists before doing any expensive work
    required_inputs = [
        ("MP3 file", mp3_file_path),
        ("Vosk model", model_path),
        ("Speaker model", spk_model_path),
    ]
    for label, path in required_inputs:
        if not os.path.exists(path):
            print(f"Error: {label} not found at {path}")
            return

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Convert MP3 to WAV
    print("\nStep 1: Converting MP3 to WAV")
    wav_stream = convert_mp3_to_wav(mp3_file_path)
    if wav_stream is None:
        print("Failed to convert MP3 to WAV. Exiting.")
        return

    # Measure once; getvalue() copies the whole buffer.
    converted_size = len(wav_stream.getvalue())
    print(f"Converted audio length: {converted_size} bytes")
    if converted_size == 0:
        print("Converted audio is empty. Check the original MP3 file.")
        return

    # Perform transcription
    print("\nStep 2: Transcribing audio and performing speaker diarization")
    transcription_results = transcribe_with_diarization(wav_stream, model_path, spk_model_path)
    if transcription_results is None:
        print("Transcription failed. Exiting.")
        return

    print(f"Number of transcription results: {len(transcription_results)}")
    if not transcription_results:
        print("Transcription results are empty.")
        return
    print(f"First transcription result: {transcription_results[0]}")

    # Process results
    print("\nStep 3: Processing transcription results into readable utterances")
    utterances = group_into_utterances(transcription_results)
    if utterances is None:
        print("Utterance processing failed. Exiting.")
        return

    print(f"Number of utterances: {len(utterances)}")
    if not utterances:
        # Dump the raw results so the user can see why nothing was grouped.
        print("No utterances were generated.")
        print("Debug: Printing raw transcription results:")
        for i, result in enumerate(transcription_results):
            print(f"Result {i + 1}:")
            print(json.dumps(result, indent=2))
        return
    print(f"First utterance: {utterances[0]}")

    # Print and save the results
    print("\nStep 4: Saving results")
    output_text_path = os.path.join(output_dir, 'transcription_results.txt')
    output_json_path = os.path.join(output_dir, 'transcription_results.json')

    try:
        _save_transcript(utterances, output_text_path, output_json_path)
        print("\nTranscription and diarization process completed successfully!")
    except Exception as e:
        # Best effort: report the failure instead of crashing the notebook cell.
        print(f"Error while saving results: {e}")

if __name__ == "__main__":
    main()

Attempting to mount Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive successfully mounted.

Using the following paths:
MP3 File: /content/drive/MyDrive/ISEA_Test_Audio/Test_audio.mp3
Vosk Model: /content/drive/MyDrive/ISEA_Test_Audio/vosk-model-en-us-0.22
Speaker Model: /content/drive/MyDrive/ISEA_Test_Audio/vosk-model-en-us-0.22/spk_model
Output Directory: /content/drive/MyDrive/ISEA_Test_Audio

Step 1: Converting MP3 to WAV
Converting MP3 file: /content/drive/MyDrive/ISEA_Test_Audio/Test_audio.mp3
MP3 to WAV conversion completed.
Converted audio length: 1081346 bytes

Step 2: Transcribing audio and performing speaker diarization
Initializing speech recognition model...
Starting transcription and diarization process...
Intermediate result: {'spk': [1.536778, 0.780921, 0.534482, 0.878988, -0.765632, -0.431617, 0.072102, 0.217427, 0.887421, 1.037435, 1.309376, -2.053669, -0.78386