<a href="https://colab.research.google.com/github/SingularitySmith/PRUT-Transcriber/blob/main/June16_Scribe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table of Contents / Code Structure:

>[Table of Contents / Code Structure:](#scrollTo=r82qZenYB6b6&uniqifier=1)

>[Improved code:](#scrollTo=-W9d6E4LzxZY&uniqifier=1)

>>[Setting of Runtime Environment to GPU (e.g. "T4 GPU")](#scrollTo=BeZdV_fA_gQh&uniqifier=1)

>>[Import libraries & packages](#scrollTo=FOISPAh-_enF&uniqifier=1)

>>[Set locale to UTF-8](#scrollTo=p5Ezc1W_Ag8u&uniqifier=1)

>>[Mount Google Drive (Directories)](#scrollTo=Qp_ZD9TQ2Br6&uniqifier=1)

>>[Install Model & (WhisperX)](#scrollTo=WupiwhpJAFKQ&uniqifier=1)

>>[Configure WhisperX](#scrollTo=8DOv2Ng_ANRU&uniqifier=1)

>>[Perform Transcription & Process Files](#scrollTo=ZowUwOlCAOje&uniqifier=1)

>>[Reset/Clean the storage/environment (for successive runs)](#scrollTo=l5WNba1aAP5d&uniqifier=1)

>>[Disconnect and delete the runtime (for full reset)](#scrollTo=2hYCbMasCvy2&uniqifier=1)

>[Old Code](#scrollTo=uetET3Wd1oUD&uniqifier=1)



# Improved code:

In [None]:
# Run the `nvidia-smi` shell command to show which GPU (if any) is attached to this runtime.
!nvidia-smi


## 1.  Setting of Runtime Environment to GPU (e.g. "T4 GPU")

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn
import locale
import os
import gc

# GPU availability check (TensorFlow).
# Abort early unless TensorFlow reports the first GPU under its usual device name.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

# PyTorch device configuration: CUDA when available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Compare the device *type*, not the device object itself: a torch.device does
# not reliably compare equal to the plain string 'cpu', so the previous
# `device == 'cpu'` guard could let a CPU-only runtime slip through.
if device.type == 'cpu':
    raise SystemError('PyTorch couldn’t find GPU')

In [None]:
import tensorflow as tf

# TensorFlow side: stop right away when the expected GPU device is missing,
# otherwise report the device that was found.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

import torch

# PyTorch side: use CUDA when it is available and fall back to the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Relocate the model and the tensor onto the selected device.
# NOTE(review): `model` and `tensor` are not assigned in this cell or in any
# cell shown above it — confirm where they are expected to come from.
model.to(device)
tensor = tensor.to(device)

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn

# TensorFlow GPU check: fail fast when the expected device name is not reported,
# otherwise print the device that was found.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# Check for GPU availability in PyTorch: CUDA when present, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class SimpleModel(nn.Module):
    """Minimal placeholder network used only to exercise the device transfer.

    `SimpleModel` was referenced by this cell without being defined anywhere,
    which made the cell fail with a NameError. It maps the 10 input features
    of the example tensor below to a single output value.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10, 1)

    def forward(self, x):
        """Apply the single linear layer to `x`."""
        return self.linear(x)


# Initialize model and tensor
model = SimpleModel()
tensor = torch.randn(1, 10)  # Example tensor: one sample with 10 features

# Move the model and tensor to the selected device
model = model.to(device)
tensor = tensor.to(device)


Found GPU at: /device:GPU:0


## 2.  Import libraries & packages (Start here when on T4 GPU!)

In [None]:
# conda create --name whisperx python=3.10
# conda activate whisperx

# Import necessary libraries

import torch
import locale
import subprocess
# subprocess.run(["pip", "install", "git+https://github.com/m-bain/whisperx.git", "--upgrade"], check=True)
# import whisperx
from google.colab import drive
from google.colab import runtime
import os
import gc

!pip install pydub
from pydub import AudioSegment


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


## 3. Set locale to UTF-8

In [None]:
# Set the locale to UTF-8 to avoid potential encoding issues.
# `locale` is imported by the import cell above, so it is not imported again here.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# Also set the locale environment variables for this session via the %env magic.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Print the encoding Python reports as preferred, as a quick sanity check.
print(locale.getpreferredencoding())

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
UTF-8


## 4.  Mount Google Drive (Directories)

In [None]:
# Mount Google Drive for file access
from google.colab import drive
drive.mount('/content/drive')

# Folder scanned for input files and folder that receives the transcripts.
# NOTE(review): both variables point at the same 'Transcriptions' folder, while
# the "Old Code" section reads its audio from '.../AudioFilesWAV' — confirm
# which folder the .wav files are really in.
source_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/Transcriptions'
target_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/Transcriptions'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(target_directory, exist_ok=True)

Mounted at /content/drive


## 5.  Install Model & (WhisperX)

In [None]:
# WhisperX installation: import the package when it is already present,
# otherwise install it from its GitHub repository and import it again.
import importlib
import subprocess
import sys

try:
    import whisperx
except ImportError:
    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook actually imports from.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "git+https://github.com/m-bain/whisperx.git", "--upgrade"],
        check=True,
    )
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    try:
        import whisperx
    except ImportError:
        # Previously this message was unreachable: the retry import raised
        # before the verification step could run.
        print("Whisperx installation failed.")
        raise

print("Whisperx installed successfully.")

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


Whisperx installed successfully.


## 6.  Configure WhisperX

In [None]:
# 1. Ensure GPU availability
import tensorflow as tf
import torch

# TensorFlow: require the first GPU under its usual device name.
tf_device_name = tf.test.gpu_device_name()
if tf_device_name != '/device:GPU:0':
    raise SystemError('GPU device not found for TensorFlow')

# PyTorch: compare the device *type*; a torch.device object does not reliably
# compare equal to the plain string 'cpu', so `torch_device == 'cpu'` could
# never stop a CPU-only runtime.
torch_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch_device.type == 'cpu':
    raise SystemError('GPU device not found for PyTorch')

print(f'TensorFlow device: {tf_device_name}')
print(f'PyTorch device: {torch_device}')
# 2. Install missing pydub library
!pip install pydub

# 3. Install WhisperX
!pip install git+https://github.com/m-bain/whisperx.git

# 4. Load the WhisperX models
import os

import whisperx

device = "cuda"  # Using GPU
compute_type = "float32"  # For better accuracy
# The transcription loop passes `batch_size` to model.transcribe(); 4 is the
# value the other configuration cells in this notebook use.
batch_size = 4  # Adjust based on GPU memory

# Load the Whisper model
whisper_model = whisperx.load_model("large-v3", device, language='de', compute_type=compute_type)
# The transcription and cleanup cells refer to the model as `model`. Both names
# point at the same object, so drop both when freeing GPU memory.
model = whisper_model

# Load the Whisper align model
model_a, metadata = whisperx.load_align_model(language_code='de', device=device)

# Load the Whisper diarization model.
# The Hugging Face token is read from the HF_TOKEN environment variable rather
# than being pasted into the notebook; it falls back to the previous empty string.
hf_token = os.environ.get("HF_TOKEN", "")
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)

print('WhisperX models loaded successfully and ready to use on GPU.')


## 7.  Perform Transcription & Process Files

In [None]:
# Map each diarization speaker ID to a sequential label (T1, T2, ...).
# NOTE(review): the mapping is shared by every file of the run, so an ID seen in
# one file keeps its label in all later files — confirm that this is intended.
speaker_labels = {}
speaker_counter = 1

# Process each audio file. sorted() fixes the processing order, which keeps the
# label assignment above reproducible from run to run.
for audio_file in sorted(os.listdir(source_directory)):
    # Split the extension off once: the stem names the transcript file, and the
    # case-insensitive comparison also accepts names ending in '.WAV'.
    # (str.replace('.wav', '.txt') would rewrite every '.wav' inside the name.)
    file_stem, file_extension = os.path.splitext(audio_file)
    if file_extension.lower() != '.wav':
        continue

    try:
        audio_path = os.path.join(source_directory, audio_file)
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=batch_size)

        # Align transcription
        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

        # Diarization
        diarize_segments = diarize_model(audio, min_speakers=5, max_speakers=9)
        result = whisperx.assign_word_speakers(diarize_segments, result)

        # Give every speaker ID not seen before the next sequential label
        for segment in result["segments"]:
            if 'speaker' in segment and segment['speaker'] not in speaker_labels:
                speaker_labels[segment['speaker']] = f"T{speaker_counter}"
                speaker_counter += 1

        # One line per segment: label, time span, text
        transcription_file = os.path.join(target_directory, file_stem + '.txt')
        with open(transcription_file, 'w', encoding='utf-8') as file:
            for segment in result["segments"]:
                speaker_label = speaker_labels.get(segment.get('speaker'), 'Unknown Speaker')
                file.write(f"{speaker_label} [{segment['start']}-{segment['end']}]: {segment['text']}\n")

    except KeyError as e:
        # A missing key in the result is reported and the file is skipped
        print(f"Error processing {audio_file}: {e}")

    except RuntimeError as e:
        # Only out-of-memory errors are tolerated; anything else propagates
        # with its original traceback.
        if "out of memory" not in str(e):
            raise
        print(f"Out of memory error encountered with {audio_file}. Attempting to free up memory.")

    # Clear memory after each file, including after a handled error
    torch.cuda.empty_cache()
    gc.collect()


Failed to align segment (" Untertitelung. BR 2018"): backtrack failed, resorting to original...


## 8.  Reset/Clean the storage/environment (for successive runs)

In [None]:
# Optional: drop the model references to free up GPU resources.
# Popping from globals() instead of `del` keeps this cell re-runnable: a plain
# `del` raises NameError when a name is already gone or was never created.
# `whisper_model` is the name the "Configure WhisperX" cell binds the Whisper
# model to, so it is dropped here as well.
for _model_name in ("model", "whisper_model", "model_a", "diarize_model"):
    globals().pop(_model_name, None)
torch.cuda.empty_cache()
gc.collect()


0

## 9. Compare & Merge .txt files

In [None]:
# Install the `diffuse` package; the next cell imports `Diff` from it.
# NOTE(review): confirm that this package really provides a `Diff` class.
!pip install diffuse


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [None]:
import os
from diffuse import Diff

def compare_and_merge(folder_paths, session_ids):
    """Print a diff of each session's transcript across several run folders.

    For every session ID, `<folder>/<session_id>.txt` is read from each folder
    in `folder_paths`; the first two versions are compared and every further
    version is added to the comparison.

    Args:
        folder_paths: Folders holding one transcript per session (at least two).
        session_ids: Session names; each maps to a file `<session_id>.txt`.

    NOTE(review): the `Diff` / `diff_new` API is used as found here — confirm
    it against the `diffuse` package that is actually installed.
    """
    for session_id in session_ids:
        contents = []
        for folder in folder_paths:
            file_path = os.path.join(folder, f'{session_id}.txt')
            # The transcripts are written as UTF-8, so read them as UTF-8
            # instead of relying on the runtime's default encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                contents.append(file.read())

        # Create a diff object
        diff = Diff(contents[0], contents[1])  # Compare first two runs

        # Add more versions if needed
        for content in contents[2:]:
            diff = diff.diff_new(content)

        # Print or save the diff
        print(diff)  # or save to a file

# Example usage
# Placeholder folders: replace them with the real per-run directories. The
# literal `...` was removed from the list because it is an Ellipsis object,
# which os.path.join() cannot handle.
folder_paths = ['/path/to/1stRun', '/path/to/2ndRun']  # Update with actual paths
session_ids = ['G2', 'G3','G4', 'G5','G6', 'G7', 'G8', 'G9','G10', 'G11','G12']  # Update with actual session IDs
compare_and_merge(folder_paths, session_ids)


In [None]:

import os
import difflib

def compare_and_merge(source_dir, target_dir, session_ids, runs=None):
    """Merge the transcripts of several runs side by side, one column per run.

    For every session ID, `<source_dir>/<run>/<session_id>.txt` is read for each
    run and `<target_dir>/merged_<session_id>.txt` is written. Row i of the
    merged file holds line i of every run, each followed by a tab.

    Args:
        source_dir: Folder containing one sub-folder per run.
        target_dir: Folder for the merged files; created when missing.
        session_ids: Session names; each maps to a file `<session_id>.txt`.
        runs: Sub-folder names to merge, in column order. Defaults to the seven
            runs this notebook was written for.

    Raises:
        FileNotFoundError: If a run has no transcript for a session.
    """
    if runs is None:
        runs = ['1stRun', '2ndRun_8at16', '3rdRun_8at16', '4thRun_8at16', '5thRun_4at32', '6thRun_6at32', '7thRun_6at32x8Speakers']
    os.makedirs(target_dir, exist_ok=True)

    for session_id in session_ids:
        # Read and store all file contents
        contents = []
        for run in runs:
            file_path = os.path.join(source_dir, run, f'{session_id}.txt')
            with open(file_path, 'r', encoding='utf-8') as file:
                contents.append(file.readlines())
        max_length = max((len(file_content) for file_content in contents), default=0)

        # Merge contents line by line
        with open(os.path.join(target_dir, f'merged_{session_id}.txt'), 'w', encoding='utf-8') as merged_file:
            for i in range(max_length):
                # A run that is shorter than the longest one contributes an
                # empty column; skipping it would shift every later column
                # one position to the left.
                columns = [content[i].rstrip() if i < len(content) else "" for content in contents]
                merged_file.write("".join(column + "\t" for column in columns) + "\n")

# Usage example: per-run transcripts are read from the first folder and the
# merged files are written to the second.
source_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/Transcriptions'
target_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/MergedTranscriptions'
# Sessions G2 through G12
session_ids = [f'G{session_number}' for session_number in range(2, 13)]

compare_and_merge(source_dir=source_directory, target_dir=target_directory, session_ids=session_ids)


IndentationError: expected an indented block after 'with' statement on line 2 (<ipython-input-7-a0cb9cc1db73>, line 6)

In [None]:
import os
import difflib

def compare_and_merge(source_dir, target_dir, session_ids, runs=None):
    """Write, per session, the unified diffs of every run against the first run.

    For every session ID, `<source_dir>/<run>/<session_id>.txt` is read for each
    run. The first run is the baseline; each further run is diffed against it
    and all diffs are concatenated into `<target_dir>/merged_<session_id>.txt`.

    Earlier, the diff output of one comparison was itself diffed against the
    next run, and `lineterm=''` left the diff header lines without a line
    break, so they ran together in the written file.

    Args:
        source_dir: Folder containing one sub-folder per run.
        target_dir: Folder for the merged files; created when missing.
        session_ids: Session names; each maps to a file `<session_id>.txt`.
        runs: Sub-folder names to compare; the first one is the baseline.
            Defaults to the seven runs this notebook was written for.

    Raises:
        FileNotFoundError: If a run has no transcript for a session.
    """
    if runs is None:
        runs = ['1stRun', '2ndRun_8at16', '3rdRun_8at16', '4thRun_8at16', '5thRun_4at32', '6thRun_6at32', '7thRun_6at32x8Speakers']
    os.makedirs(target_dir, exist_ok=True)

    for session_id in session_ids:
        contents = []
        for run in runs:
            file_path = os.path.join(source_dir, run, f'{session_id}.txt')
            with open(file_path, 'r', encoding='utf-8') as file:  # Specify UTF-8 encoding
                # Give every line a line break (the last line of a file may
                # lack one) so the diff output never glues two lines together.
                contents.append([line if line.endswith('\n') else line + '\n' for line in file])

        # Compare every other run with the baseline run
        baseline = contents[0]
        baseline_label = f'{runs[0]}/{session_id}.txt'
        merged_content = []
        for run, other_content in zip(runs[1:], contents[1:]):
            merged_content.extend(
                difflib.unified_diff(
                    baseline,
                    other_content,
                    fromfile=baseline_label,
                    tofile=f'{run}/{session_id}.txt',
                )
            )

        # Write the merged content to a file
        with open(os.path.join(target_dir, f'merged_{session_id}.txt'), 'w', encoding='utf-8') as file:  # Specify UTF-8 encoding
            file.writelines(merged_content)

# Usage example
# The source path previously contained a doubled slash ('...LouisLongin//Transcriptions');
# it now matches the path used by the other cells.
source_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/Transcriptions'
target_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/MergedTranscriptions'
session_ids = ['G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9', 'G10', 'G11', 'G12']

compare_and_merge(source_directory, target_directory, session_ids)



In [None]:
import os

def compare_and_merge(source_dir, target_dir, session_ids):
    """Write one tab-separated file per session with the runs side by side.

    Each session's transcript is read from every run folder below `source_dir`.
    `merged_<session_id>.txt` in `target_dir` then gets one row per line index;
    every run contributes its line followed by a tab, or just the tab when the
    run has no line at that index.
    """
    run_folders = ('1stRun', '2ndRun_8at16', '3rdRun_8at16', '4thRun_8at16', '5thRun_4at32', '6thRun_6at32', '7thRun_6at32x8Speakers')
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for session_id in session_ids:
        transcript_name = f'{session_id}.txt'

        # Load the transcript of every run before anything is written
        lines_per_run = []
        for run_folder in run_folders:
            with open(os.path.join(source_dir, run_folder, transcript_name), 'r', encoding='utf-8') as transcript:
                lines_per_run.append(transcript.readlines())
        row_count = max((len(run_lines) for run_lines in lines_per_run), default=0)

        # Build each merged row in memory and write it in a single call
        merged_path = os.path.join(target_dir, f'merged_{session_id}.txt')
        with open(merged_path, 'w', encoding='utf-8') as merged_file:
            for row in range(row_count):
                cells = [
                    run_lines[row].rstrip() if row < len(run_lines) else ""
                    for run_lines in lines_per_run
                ]
                merged_file.write("".join(cell + "\t" for cell in cells) + "\n")

# Usage example: merge sessions G2 to G12 from the run folders on Drive.
source_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/Transcriptions'
target_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/MergedTranscriptions'
session_ids = ['G' + str(session_number) for session_number in range(2, 13)]

compare_and_merge(
    source_directory,
    target_directory,
    session_ids,
)


In [None]:
import os

def compare_and_merge(source_dir, target_dir, session_ids, runs=None):
    """Merge the transcripts of several runs, annotating lines that differ.

    For every session ID, `<source_dir>/<run>/<session_id>.txt` is read for each
    run and `<target_dir>/merged_<session_id>.txt` is written. A line that is
    identical in every run that has it is written once. Otherwise a
    "--- Line N Variations ---" block lists each run's text as "Version K",
    where K is the 1-based position of the run in `runs`.

    Args:
        source_dir: Folder containing one sub-folder per run.
        target_dir: Folder for the merged files; created when missing.
        session_ids: Session names; each maps to a file `<session_id>.txt`.
        runs: Sub-folder names to merge. Defaults to the seven runs this
            notebook was written for.

    Raises:
        FileNotFoundError: If a run has no transcript for a session.
    """
    if runs is None:
        runs = ['1stRun', '2ndRun_8at16', '3rdRun_8at16', '4thRun_8at16', '5thRun_4at32', '6thRun_6at32', '7thRun_6at32x8Speakers']
    os.makedirs(target_dir, exist_ok=True)

    for session_id in session_ids:
        # Read and store all file contents
        contents = []
        for run in runs:
            file_path = os.path.join(source_dir, run, f'{session_id}.txt')
            with open(file_path, 'r', encoding='utf-8') as file:
                contents.append(file.readlines())
        max_length = max((len(file_content) for file_content in contents), default=0)

        # Merge contents line by line
        with open(os.path.join(target_dir, f'merged_{session_id}.txt'), 'w', encoding='utf-8') as merged_file:
            for i in range(max_length):
                # Keep every line paired with the position of its run, so that
                # "Version K" names the same run even when an earlier, shorter
                # run has no line at this index.
                line_versions = [
                    (run_position, content[i].rstrip())
                    for run_position, content in enumerate(contents, start=1)
                    if i < len(content)
                ]
                first_text = line_versions[0][1]

                if all(text == first_text for _, text in line_versions):
                    # All lines are identical
                    merged_file.write(first_text + "\n")
                else:
                    # Lines are different, annotate them
                    merged_file.write(f"--- Line {i+1} Variations ---\n")
                    for run_position, text in line_versions:
                        merged_file.write(f"Version {run_position}: {text}\n")
                    merged_file.write("\n")

# Usage example
# Input: one sub-folder per run below the Transcriptions folder.
source_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/Transcriptions'
# Output: merged files, one per session.
target_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/MergedTranscriptions'
# Sessions G2 ... G12
session_ids = [f'G{session_number}' for session_number in range(2, 13)]

compare_and_merge(source_dir=source_directory, target_dir=target_directory, session_ids=session_ids)


In [None]:
# Debugging aid: show which file is read and which merged file is written.
# NOTE(review): `file_path`, `target_dir` and `session_id` are local names of
# compare_and_merge(); run as a stand-alone cell, this raises the NameError
# recorded below. These prints presumably belong inside that function.
print(f"Reading from: {file_path}")
...
print(f"Writing to: {os.path.join(target_dir, f'merged_{session_id}.txt')}")


NameError: name 'file_path' is not defined

In [None]:
# Configure WhisperX
import os

device = "cuda"  # Using GPU
batch_size = 4  # Adjust based on GPU memory
compute_type = "float32"  # For better accuracy

# Transcription model, alignment model and diarization pipeline (German audio)
model = whisperx.load_model("large-v3", device, language='de', compute_type=compute_type)
model_a, metadata = whisperx.load_align_model(language_code='de', device=device)
# The Hugging Face token is read from the HF_TOKEN environment variable rather
# than being pasted into the notebook; it falls back to the previous empty string.
diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ.get("HF_TOKEN", ""), device=device)


vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 11.6MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0+cu121. Bad things might happen unless you revert torch to 1.x.


Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_voxpopuli_base_10k_asr_de.pt" to /root/.cache/torch/hub/checkpoints/wav2vec2_voxpopuli_base_10k_asr_de.pt
100%|██████████| 360M/360M [00:11<00:00, 31.9MB/s]


config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

In [None]:
# Mount Google Drive at /content/drive again, requesting a forced remount.
# `drive` must already be imported (`from google.colab import drive`).
drive.mount('/content/drive', force_remount=True)


ValueError: Mountpoint must not already contain files

## 10. Disconnect and delete the runtime (for full reset)

In [None]:
# Disconnect from the Colab runtime and delete it (full reset).
# Importing `runtime` here lets the cell work on its own; without the import
# cell having run first, the call fails with "name 'runtime' is not defined".
from google.colab import runtime

runtime.unassign()

NameError: name 'runtime' is not defined



---



# Old Code

In [None]:
# Old single-cell version of the pipeline: set the locale, install WhisperX,
# mount Drive, load the models and transcribe every .wav file in one go.
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


# Install (or upgrade) WhisperX from GitHub; check=True raises if pip fails.
import subprocess
subprocess.run(["pip", "install", "git+https://github.com/m-bain/whisperx.git", "--upgrade"], check=True)
# Set the locale environment variables for this session via the %env magic.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8


# Mount Google Drive so the audio and transcript folders are reachable.
from google.colab import drive
drive.mount('/content/drive')

import os
import whisperx
import gc

# Set directories for source audio files and target transcriptions
source_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/AudioFilesWAV'
target_directory = '/content/drive/My Drive/MELESSA-GoogleDrive/Transcriptions-LouisLongin/Transcriptions'

# Create the target directory if it doesn't exist
if not os.path.exists(target_directory):
    os.makedirs(target_directory)

# Initialize GPU settings
device = "cuda"
batch_size = 4 # Adjust based on GPU memory
compute_type = "float32" # Use "float16" for a balance between performance and accuracy

# Initialize models: transcription model and alignment model, both for German ('de')
model = whisperx.load_model("large-v3", device, language='de', compute_type=compute_type)
model_a, metadata = whisperx.load_align_model(language_code='de', device=device)

# Authenticate with Hugging Face for diarization model
# NOTE(review): the token passed here is an empty string — presumably removed
# before sharing; supply a real token before running.
diarize_model = whisperx.DiarizationPipeline(use_auth_token="", device=device)

# Process each audio file (only names ending in '.wav' are handled)
for audio_file in os.listdir(source_directory):
    if audio_file.endswith('.wav'):
        # Load and transcribe audio
        audio_path = os.path.join(source_directory, audio_file)
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=batch_size)

        # Align transcription
        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

        # Perform diarization
        diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=6)  # Adjust as needed
        result = whisperx.assign_word_speakers(diarize_segments, result)

        # Write one "<speaker>: <text>" line per segment into a .txt file named after the audio file
        transcription_file = os.path.join(target_directory, audio_file.replace('.wav', '.txt'))
        with open(transcription_file, 'w', encoding='utf-8') as file:
            for segment in result["segments"]:
                speaker_label = segment.get('speaker', 'Unknown Speaker')  # Default to 'Unknown Speaker' if not found
                file.write(f"{speaker_label}: {segment['text']}\n")



        # Optional: Clear memory if needed
        gc.collect()

# Optional: Delete models to free up GPU resources
del model, model_a, diarize_model


In [None]:
import torch

# Report whether PyTorch can see a CUDA device ...
print(torch.cuda.is_available())
# ... and release the GPU memory PyTorch is holding in its cache.
torch.cuda.empty_cache()

True
