# Install necessary libraries (run in Colab)

In [1]:
# Python packages: Whisper (installed from its GitHub repository), transformers,
# pymongo, sentence-transformers and pydub.
!pip install git+https://github.com/openai/whisper.git transformers pymongo sentence-transformers pydub
!apt-get install -y ffmpeg  # For handling audio files

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-jvpkwmyq
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-jvpkwmyq
  Resolved https://github.com/openai/whisper.git to commit 271445b2f24f00f8175c4fb7ae91876f7451dfc1
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


# Drive access

In [2]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Libraries

In [3]:
import whisper
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, T5ForConditionalGeneration, T5Tokenizer
from sentence_transformers import SentenceTransformer
from pydub import AudioSegment
from pydub.utils import make_chunks
from pymongo import MongoClient
import torch
import os
import json
import math
import datetime

# Load the Whisper model

In [4]:
# Load models (ensure GPU is enabled in Colab runtime settings)
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Whisper model for transcription: load the "large" checkpoint and move it to `device`.
whisper_model = whisper.load_model("large").to(device)

  checkpoint = torch.load(fp, map_location=device)


# MongoDB Operations

## MongoDB connection setup

In [None]:
from pymongo import MongoClient


def get_mongo_client(uri):
    """Create a MongoDB client for ``uri`` and verify that the server answers.

    Args:
        uri: MongoDB connection string.

    Returns:
        The connected ``MongoClient``, or ``None`` when the client cannot be
        created or the server cannot be reached. Errors are printed, not raised.
    """
    client = None
    try:
        client = MongoClient(uri)
        # MongoClient connects lazily: constructing it succeeds even when the
        # server is unreachable or the credentials are wrong. A ping forces a
        # real round trip so the success message below is truthful.
        client.admin.command("ping")
    except Exception as e:
        print(f"Error connecting to MongoDB: {e}")
        if client is not None:
            # Release the background resources of the unusable client.
            client.close()
        return None

    print("Connected to MongoDB successfully.")
    return client

## MongoDB connection URI (replace with your own URI)

In [None]:
# Read the MongoDB URI from the MONGODB_URI environment variable, or prompt for
# it when the variable is not set, so credentials are never stored in the notebook.
from getpass import getpass

mongodb_uri = os.environ.get("MONGODB_URI") or getpass("MongoDB URI: ")
client = get_mongo_client(mongodb_uri)

Connected to MongoDB successfully.


## Database and collection setup

In [7]:
# Handles to the 'sandalquest' database and its 'transcriptions' collection.
# `collection` is the module-level name the save/lookup helpers read.
db = client['sandalquest']
collection = db['transcriptions']

## Function to save the merged transcription to MongoDB

In [None]:
def save_to_mongodb(metadata):
    """Insert one transcription record into the module-level ``collection``.

    Args:
        metadata: dict describing a processed file. It must contain a
            ``'filename'`` key, which is used in the confirmation message.

    Returns:
        None. Any error raised while inserting or logging is printed, not raised.
    """
    try:
        collection.insert_one(metadata)
        # Keep the replacement field on one line: splitting it across lines in a
        # single-quoted f-string is a SyntaxError on Python versions before 3.12.
        print(f"Saved merged transcription for {metadata['filename']} to MongoDB.")
    except Exception as e:
        print(f"Error saving to MongoDB: {e}")

## Check if the file is already processed

In [9]:
def is_file_processed(filename):
    """Tell whether a record for ``filename`` already exists in ``collection``.

    Args:
        filename: value matched against the ``filename`` field of stored records.

    Returns:
        True when a matching document is found, False when none is found or
        when the lookup raises (the error is printed).
    """
    query = {"filename": filename}
    try:
        existing = collection.find_one(query)
    except Exception as e:
        print(f"Error checking file in MongoDB: {e}")
        return False
    return existing is not None

# Function to split audio based on file size (20 MB chunks)

In [None]:
def split_audio_by_size(file_path, max_size_mb=20):
    """Split an audio file into roughly equal-duration chunks sized by file size.

    The number of chunks is the on-disk file size divided by ``max_size_mb``
    (rounded up); the audio duration is then divided evenly between them.

    Args:
        file_path: path of the audio file to load and split.
        max_size_mb: target upper bound, in megabytes of the source file, that
            each chunk should correspond to. Must be positive.

    Returns:
        A list of audio chunks as produced by ``make_chunks``. An empty list is
        returned when the audio is empty or when any error occurs (the error is
        printed, not raised).
    """
    try:
        if max_size_mb <= 0:
            raise ValueError("max_size_mb must be positive")

        audio = AudioSegment.from_file(file_path)
        file_size_bytes = os.path.getsize(file_path)
        max_size_bytes = max_size_mb * 1024 * 1024

        # Calculate number of chunks needed; never fewer than one, so a
        # zero-byte file cannot cause a division by zero below.
        num_chunks = max(1, math.ceil(file_size_bytes / max_size_bytes))

        # Round the chunk length UP: with floor division the leftover
        # milliseconds spill into an extra, tiny trailing chunk. Clamp to 1 ms
        # so very short audio never yields a zero-length chunk size.
        chunk_length_ms = max(1, math.ceil(len(audio) / num_chunks))

        # Split the audio into chunks
        chunks = make_chunks(audio, chunk_length_ms)
        print(f"Split audio into {len(chunks)} chunks based on size.")
        return chunks

    except Exception as e:
        print(f"Error splitting audio by size: {e}")
        return []

# Function for Kannada transcription using Whisper

In [None]:
def transcribe_audio_kannada(audio_path):
    """Transcribe the audio at ``audio_path`` with ``whisper_model``, language "kn".

    Args:
        audio_path: path of the audio file handed to ``whisper_model.transcribe``.

    Returns:
        The ``'text'`` entry of the transcription result, or an empty string
        when anything raises (the error is printed).
    """
    try:
        print("Transcribing in progress...")
        result = whisper_model.transcribe(audio_path, language="kn")
        text = result['text']
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return ""
    return text

# Process audio files and merge transcriptions

In [None]:
def process_audio_files(directory_path):
    """Transcribe every unprocessed .mp3 file in a directory and store the results.

    For each file name in ``directory_path`` ending in ".mp3" (case-insensitive)
    that ``is_file_processed`` does not report as done, the audio is split with
    ``split_audio_by_size``, each chunk is exported to a temporary WAV file and
    transcribed with ``transcribe_audio_kannada``, and the merged text plus file
    metadata is passed to ``save_to_mongodb``.

    Args:
        directory_path: directory whose direct entries are scanned (no recursion).

    Returns:
        None. Per-file errors are printed and the loop moves on to the next file.
    """
    temp_chunk_path = "temp_chunk.wav"
    process_index = 1

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)

        # Skip non-audio files (the extension check ignores case)
        if not filename.lower().endswith(".mp3"):
            continue

        # Check if the file is already processed
        if is_file_processed(filename):
            print(f"File {filename} already processed. Skipping.")
            continue

        try:
            print(f"Processing file {process_index}: {filename}")
            chunks = split_audio_by_size(file_path, max_size_mb=20)

            # split_audio_by_size returns [] when it fails. Saving an empty
            # record would mark the file as processed, so it would never be
            # retried on a later run.
            if not chunks:
                print(f"No audio chunks produced for {filename}. Skipping.")
                continue

            # Transcribe each chunk and collect the results
            transcriptions = []
            try:
                for i, chunk in enumerate(chunks):
                    print(f"Transcribing chunk {i + 1}/{len(chunks)} of file {filename}...")
                    chunk.export(temp_chunk_path, format="wav")
                    transcriptions.append(transcribe_audio_kannada(temp_chunk_path))
            finally:
                # Do not leave the scratch WAV file behind, even on failure.
                if os.path.exists(temp_chunk_path):
                    os.remove(temp_chunk_path)

            merged_transcription = " ".join(transcriptions)

            # Prepare metadata for the merged transcription
            metadata = {
                'file_index': process_index,
                'filename': filename,
                'merged_transcription': merged_transcription.strip(),
                'timestamp': datetime.datetime.utcnow(),
                'file_path': file_path,
                'file_size': os.path.getsize(file_path),
                'audio_format': filename.split(".")[-1],
                'duration_ms': len(AudioSegment.from_file(file_path))
            }

            # Save the merged transcription to MongoDB
            save_to_mongodb(metadata)
            print(f"Finished processing file {process_index}: {filename}")

            # Increment the process index for the next file
            process_index += 1

        except Exception as e:
            print(f"Error processing file {filename}: {e}")

    print("All files processed or skipped successfully.")

# Step 1: Unzip and verify the audio dataset

In [None]:
import zipfile
import os

# Path to your dataset in Google Drive
# Path to your dataset in Google Drive
zip_path = "/content/drive/MyDrive/SandalWoonDatasets.zip"
# Path where files will be extracted
extracted_path = "/content/drive/MyDrive/SandalWood"

# Step 1: Create the extraction folder if it doesn't exist
# (exist_ok=True means an already existing folder is not an error, so this cell can be re-run)
os.makedirs(extracted_path, exist_ok=True)

# Helper function to list files in a directory


def list_files_in_directory(directory_path):
    """List every file below ``directory_path``, recursively.

    Args:
        directory_path: root directory to walk.

    Returns:
        A list of file paths relative to ``directory_path``, in ``os.walk``
        order. Directories themselves are not included.
    """
    return [
        os.path.relpath(os.path.join(parent, name), directory_path)
        for parent, _subdirs, names in os.walk(directory_path)
        for name in names
    ]


# Step 2: Check and extract files
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # List of all FILES in the ZIP archive. Directory entries are left out:
    # they never appear in a file listing of the extraction folder, so they
    # would be reported as missing (and inflate the total) on every run.
    zip_files = [info.filename for info in zip_ref.infolist() if not info.is_dir()]

    # List existing files in the extracted directory
    extracted_files = list_files_in_directory(extracted_path)

    # ZIP member names always use '/', so normalise local separators before
    # comparing; a set keeps each membership test constant-time.
    extracted_lookup = {path.replace(os.sep, "/") for path in extracted_files}

    # Find missing files
    missing_files = [file for file in zip_files if file not in extracted_lookup]

    if missing_files:
        # Replacement fields are kept on one line: splitting them across lines
        # in a single-quoted f-string is a SyntaxError before Python 3.12.
        print(f"Missing {len(missing_files)} files. Re-extracting missing files...")

        # Re-extract only missing files
        for file in missing_files:
            try:
                zip_ref.extract(file, extracted_path)
                print(f"Re-extracted file: {file}")
            except Exception as e:
                print(f"Error extracting file {file}: {e}")

    else:
        print("All files are already extracted.")

print("Dataset extraction and verification completed.")

# Verify extraction
print("Final list of extracted files:")
for file in list_files_in_directory(extracted_path):
    print(file)

# Summary
total_files_in_zip = len(zip_files)
total_extracted_files = len(list_files_in_directory(extracted_path))

if total_files_in_zip == total_extracted_files:
    print(f"All {total_files_in_zip} files are extracted successfully.")
else:
    print(f"Extraction completed with {total_extracted_files}/{total_files_in_zip} files available.")

All files are already extracted.
Dataset extraction and verification completed.
Final list of extracted files:
SandalWoodNewsStories_200.mp3
SandalWoodNewsStories_282.mp3
SandalWoodNewsStories_239.mp3
SandalWoodNewsStories_295.mp3
SandalWoodNewsStories_230.mp3
SandalWoodNewsStories_148.mp3
SandalWoodNewsStories_46.mp3
SandalWoodNewsStories_167.mp3
SandalWoodNewsStories_63.mp3
SandalWoodNewsStories_298.mp3
SandalWoodNewsStories_176.mp3
SandalWoodNewsStories_223.mp3
SandalWoodNewsStories_168.mp3
SandalWoodNewsStories_156.mp3
SandalWoodNewsStories_297.mp3
SandalWoodNewsStories_249.mp3
SandalWoodNewsStories_215.mp3
SandalWoodNewsStories_211.mp3
SandalWoodNewsStories_158.mp3
SandalWoodNewsStories_23.mp3
SandalWoodNewsStories_175.mp3
SandalWoodNewsStories_146.mp3
SandalWoodNewsStories_173.mp3
SandalWoodNewsStories_42.mp3
SandalWoodNewsStories_52.mp3
SandalWoodNewsStories_112.mp3
SandalWoodNewsStories_43.mp3
SandalWoodNewsStories_181.mp3
SandalWoodNewsStories_306.mp3
SandalWoodNewsStories_169

# Step 2: Process the files

In [14]:
# Transcribe every not-yet-processed .mp3 in the extraction folder and store the results in MongoDB.
process_audio_files(extracted_path)

File SandalWoodNewsStories_200.mp3 already processed. Skipping.
File SandalWoodNewsStories_282.mp3 already processed. Skipping.
File SandalWoodNewsStories_239.mp3 already processed. Skipping.
File SandalWoodNewsStories_295.mp3 already processed. Skipping.
File SandalWoodNewsStories_230.mp3 already processed. Skipping.
File SandalWoodNewsStories_148.mp3 already processed. Skipping.
File SandalWoodNewsStories_46.mp3 already processed. Skipping.
File SandalWoodNewsStories_167.mp3 already processed. Skipping.
File SandalWoodNewsStories_63.mp3 already processed. Skipping.
File SandalWoodNewsStories_298.mp3 already processed. Skipping.
File SandalWoodNewsStories_176.mp3 already processed. Skipping.
File SandalWoodNewsStories_223.mp3 already processed. Skipping.
File SandalWoodNewsStories_168.mp3 already processed. Skipping.
File SandalWoodNewsStories_156.mp3 already processed. Skipping.
File SandalWoodNewsStories_297.mp3 already processed. Skipping.
File SandalWoodNewsStories_249.mp3 already