In [None]:
from google.colab import drive
drive.mount('/content/drive')

MERGING AUDIO FOLDERS

In [None]:
import os
import shutil

def merge_folders(source_folders, target_folder):
    os.makedirs(target_folder, exist_ok=True)

    for folder in source_folders:
        if not os.path.exists(folder):
            print(f"Skipping non-existent folder: {folder}")
            continue

        for file_name in os.listdir(folder):
            source_path = os.path.join(folder, file_name)
            target_path = os.path.join(target_folder, file_name)

            if os.path.isfile(source_path):
                if os.path.exists(target_path):
                    base, ext = os.path.splitext(file_name)
                    counter = 1
                    new_target_path = os.path.join(target_folder, f"{base}_{counter}{ext}")

                    while os.path.exists(new_target_path):
                        counter += 1
                        new_target_path = os.path.join(target_folder, f"{base}_{counter}{ext}")

                    target_path = new_target_path

                shutil.move(source_path, target_path)
                print(f"Moved: {source_path} -> {target_path}")

def delete_json_files(folder):
    if not os.path.exists(folder):
        print(f"Folder does not exist: {folder}")
        return

    for file_name in os.listdir(folder):
        file_path = os.path.join(folder, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".json"):
            os.remove(file_path)
            print(f"Deleted: {file_path}")

# Example usage
source_folders = ["/content/drive/MyDrive/Dataset/Validate/lifecycle/281474976710766"]  # Replace with actual folder paths
target_folder = "/content/drive/MyDrive/Dataset/Validate/lifecycle"
merge_folders(source_folders, target_folder)

delete_json_files(target_folder)


# EDA

In [None]:
import os
import pandas as pd

def check_missing_transcriptions(csv_path, audio_dir):
    # Load CSV file
    df = pd.read_csv(csv_path)
    existing_ids = set(df['ID'].astype(str))

    # List all .wav files in the directory
    audio_files = {os.path.splitext(f)[0] for f in os.listdir(audio_dir) if f.endswith('.wav')}

    # Check for missing transcriptions
    missing = audio_files - existing_ids

    if missing:
        print(f"Missing transcriptions for {len(missing)} files in {audio_dir}:", missing)
    else:
        print(f"All audio files in {audio_dir} have transcriptions.")


# Example usage
check_missing_transcriptions('/content/drive/MyDrive/Dataset/Train/train.csv', '/content/drive/MyDrive/Dataset/Train/translation')
check_missing_transcriptions('/content/drive/MyDrive/Dataset/Validate/validate.csv', '/content/drive/MyDrive/Dataset/Validate/lifecycle')


Missing transcriptions for 2333 files in /content/drive/MyDrive/Dataset/Train/translation: {'281474976731242', '281474976728434', '281474976730249', '281474976729997', '281474976729832', '281474976731287', '281474976728417', '281474976731310', '281474976728470', '281474976717754', '281474976730563', '281474976727360', '281474976729610', '281474976730965', '281474976730257', '281474976729734', '281474976731095', '281474976727243', '281474976728706', '281474976730979', '281474976728696', '281474976729648', '281474976727312', '281474976728565', '281474976731038', '281474976728356', '281474976730491', '281474976730372', '281474976729613', '281474976729704', '281474976728513', '281474976729059', '281474976731067', '281474976729674', '281474976728602', '281474976729804', '281474976729055', '281474976728459', '281474976729514', '281474976730415', '281474976731087', '281474976729068', '281474976730327', '281474976728767', '281474976728799', '281474976728627', '281474976728737', '28147497673126

In [None]:
import os

def count_wav_files(directory):
    return len([file for file in os.listdir(directory) if file.lower().endswith('.wav')])

# Example usage
directory_path = "/content/drive/MyDrive/Dataset/Validate/lifecycle"  # Change this to your actual directory
wav_count = count_wav_files(directory_path)
print(f"Number of WAV files: {wav_count}")


Number of WAV files: 299


In [None]:
import os
import pandas as pd

def check_missing_audio(csv_path, audio_dir):
    # Load CSV file
    df = pd.read_csv(csv_path)

    # Ensure 'ID' column exists
    if 'ID' not in df.columns:
        raise ValueError("CSV file must contain a column named 'ID'")

    # Generate expected filenames
    df['Filename'] = df['ID'].astype(str) + '.wav'

    # Check for missing files
    missing_files = [fname for fname in df['Filename'] if not os.path.exists(os.path.join(audio_dir, fname))]

    # Print missing files
    if missing_files:
        print("Missing files:")
        for file in missing_files:
            print(file)
    else:
        print("All files exist.")

    # Print count of missing files
    print(f"Total missing files: {len(missing_files)}")

    return missing_files

# Example usage
csv_path = "/content/drive/MyDrive/Dataset/Train/train.csv"  # Replace with actual CSV file path
audio_dir = "/content/drive/MyDrive/Dataset/Train/translation"  # Replace with actual audio directory path
missing_files = check_missing_audio(csv_path, audio_dir)


In [None]:
import os

# Define the directory to check
directory = "/content/drive/MyDrive/Dataset/Train/translation"  # Change this to your actual directory

# List of filenames to check
filenames = [
    "281474976733829.wav", "281474976733611.wav", "281474976735670.wav",
    "281474976734720.wav", "281474976733782.wav", "281474976734029.wav",
    "281474976734743.wav", "281474976733752.wav", "281474976733840.wav",
    "281474976735756.wav", "281474976734789.wav", "281474976733982.wav",
    "281474976734010.wav", "281474976733330.wav", "281474976734759.wav",
    "281474976734106.wav", "281474976735658.wav", "281474976734037.wav",
    "281474976734778.wav", "281474976734749.wav", "281474976734096.wav",
    "281474976733336.wav", "281474976734757.wav", "281474976734099.wav",
    "281474976733980.wav", "281474976733851.wav", "281474976733822.wav",
    "281474976733850.wav", "281474976734046.wav", "281474976734740.wav",
    "281474976733623.wav", "281474976734738.wav", "281474976733772.wav",
    "281474976733987.wav", "281474976733779.wav", "281474976733328.wav",
    "281474976735770.wav", "281474976735668.wav", "281474976733326.wav",
    "281474976734120.wav", "281474976734024.wav", "281474976734725.wav",
    "281474976734766.wav", "281474976734773.wav", "281474976735666.wav",
    "281474976733616.wav", "281474976734019.wav", "281474976733843.wav",
    "281474976734776.wav", "281474976734098.wav", "281474976733329.wav",
    "281474976734793.wav", "281474976734102.wav", "281474976734003.wav",
    "281474976735769.wav", "281474976733589.wav", "281474976733599.wav",
    "281474976734756.wav", "281474976733854.wav", "281474976734724.wav",
    "281474976735751.wav", "281474976734001.wav", "281474976734742.wav",
    "281474976733768.wav", "281474976735673.wav", "281474976734791.wav",
    "281474976735740.wav", "281474976735746.wav", "281474976733833.wav",
    "281474976733325.wav", "281474976734000.wav", "281474976734786.wav",
    "281474976733844.wav", "281474976733322.wav", "281474976735747.wav",
    "281474976733598.wav", "281474976734017.wav", "281474976735662.wav",
    "281474976733846.wav", "281474976734035.wav", "281474976735753.wav",
    "281474976735771.wav", "281474976733605.wav", "281474976733760.wav",
    "281474976733617.wav", "281474976733596.wav", "281474976734739.wav",
    "281474976734113.wav", "281474976733346.wav", "281474976733762.wav",
    "281474976735742.wav", "281474976734775.wav", "281474976734783.wav",
    "281474976733749.wav", "281474976733834.wav", "281474976733750.wav",
    "281474976733765.wav", "281474976733323.wav", "281474976733841.wav",
    "281474976734032.wav", "281474976733756.wav", "281474976733981.wav",
    "281474976733593.wav", "281474976734006.wav", "281474976733757.wav",
    "281474976734729.wav", "281474976734728.wav", "281474976735738.wav",
    "281474976734119.wav", "281474976735667.wav", "281474976734762.wav",
    "281474976734735.wav", "281474976735772.wav", "281474976734128.wav",
    "281474976735761.wav", "281474976733857.wav", "281474976734727.wav",
    "281474976733315.wav", "281474976734785.wav", "281474976733998.wav",
    "281474976734754.wav", "281474976734118.wav", "281474976733985.wav",
    "281474976734031.wav", "281474976735759.wav", "281474976733995.wav",
    "281474976735758.wav", "281474976734026.wav", "281474976734052.wav",
    "281474976734123.wav", "281474976733317.wav", "281474976734781.wav",
    "281474976733764.wav", "281474976734764.wav", "281474976734737.wav",
    "281474976733748.wav", "281474976733332.wav", "281474976733778.wav",
    "281474976733625.wav", "281474976733603.wav", "281474976733331.wav",
    "281474976735760.wav", "281474976733825.wav", "281474976735762.wav",
    "281474976733590.wav", "281474976733830.wav", "281474976734753.wav",
    "281474976734748.wav", "281474976733860.wav", "281474976733755.wav",
    "281474976734007.wav", "281474976734745.wav", "281474976733770.wav",
    "281474976735657.wav", "281474976734015.wav", "281474976734765.wav",
    "281474976733350.wav", "281474976735754.wav", "281474976733333.wav",
    "281474976733318.wav", "281474976733345.wav", "281474976734002.wav",
    "281474976733613.wav", "281474976734787.wav", "281474976735674.wav",
    "281474976734009.wav", "281474976734011.wav", "281474976734116.wav",
    "281474976733855.wav", "281474976733853.wav", "281474976733594.wav",
    "281474976734110.wav", "281474976734117.wav", "281474976733321.wav",
    "281474976733347.wav", "281474976733343.wav", "281474976733993.wav",
    "281474976733609.wav", "281474976735739.wav", "281474976733607.wav",
    "281474976734045.wav", "281474976734722.wav", "281474976733984.wav",
    "281474976734103.wav", "281474976733773.wav", "281474976733978.wav",
    "281474976733327.wav", "281474976733761.wav", "281474976735749.wav",
    "281474976734025.wav", "281474976733826.wav", "281474976733320.wav"
]

# Check if files exist and print missing ones
missing_files = [filename for filename in filenames if not os.path.exists(os.path.join(directory, filename))]

# Print missing files
if missing_files:
    print("Missing files:")
    for file in missing_files:
        print(file)
else:
    print("All files exist.")


All files exist.


In [None]:
import os
import shutil

# Define source and destination directories
source_dir = "/content/drive/MyDrive/Dataset/Validate/lifecycle"  # Change this to your source directory
destination_dir = "/content/drive/MyDrive/Dataset//Train/translation"  # Change this to your destination directory
missing_files_log = "missing_files.txt"

# List of filenames to move
filenames = [
    "281474976733829.wav", "281474976733611.wav", "281474976735670.wav",
    "281474976734720.wav", "281474976733782.wav", "281474976734029.wav",
    "281474976734743.wav", "281474976733752.wav", "281474976733840.wav",
    "281474976735756.wav", "281474976734789.wav", "281474976733982.wav",
    "281474976734010.wav", "281474976733330.wav", "281474976734759.wav",
    "281474976734106.wav", "281474976735658.wav", "281474976734037.wav",
    "281474976734778.wav", "281474976734749.wav", "281474976734096.wav",
    "281474976733336.wav", "281474976734757.wav", "281474976734099.wav",
    "281474976733980.wav", "281474976733851.wav", "281474976733822.wav",
    "281474976733850.wav", "281474976734046.wav", "281474976734740.wav",
    "281474976733623.wav", "281474976734738.wav", "281474976733772.wav",
    "281474976733987.wav", "281474976733779.wav", "281474976733328.wav",
    "281474976735770.wav", "281474976735668.wav", "281474976733326.wav",
    "281474976734120.wav", "281474976734024.wav", "281474976734725.wav",
    "281474976734766.wav", "281474976734773.wav", "281474976735666.wav",
    "281474976733616.wav", "281474976734019.wav", "281474976733843.wav",
    "281474976734776.wav", "281474976734098.wav", "281474976733329.wav",
    "281474976734793.wav", "281474976734102.wav", "281474976734003.wav",
    "281474976735769.wav", "281474976733589.wav", "281474976733599.wav",
    "281474976734756.wav", "281474976733854.wav", "281474976734724.wav",
    "281474976735751.wav", "281474976734001.wav", "281474976734742.wav",
    "281474976733768.wav", "281474976735673.wav", "281474976734791.wav",
    "281474976735740.wav", "281474976735746.wav", "281474976733833.wav",
    "281474976733325.wav", "281474976734000.wav", "281474976734786.wav",
    "281474976733844.wav", "281474976733322.wav", "281474976735747.wav",
    "281474976733598.wav", "281474976734017.wav", "281474976735662.wav",
    "281474976733846.wav", "281474976734035.wav", "281474976735753.wav",
    "281474976735771.wav", "281474976733605.wav", "281474976733760.wav",
    "281474976733617.wav", "281474976733596.wav", "281474976734739.wav",
    "281474976734113.wav", "281474976733346.wav", "281474976733762.wav",
    "281474976735742.wav", "281474976734775.wav", "281474976734783.wav",
    "281474976733749.wav", "281474976733834.wav", "281474976733750.wav",
    "281474976733765.wav", "281474976733323.wav", "281474976733841.wav",
    "281474976734032.wav", "281474976733756.wav", "281474976733981.wav",
    "281474976733593.wav", "281474976734006.wav", "281474976733757.wav",
    "281474976734729.wav", "281474976734728.wav", "281474976735738.wav",
    "281474976734119.wav", "281474976735667.wav", "281474976734762.wav",
    "281474976734735.wav", "281474976735772.wav", "281474976734128.wav",
    "281474976735761.wav", "281474976733857.wav", "281474976734727.wav",
    "281474976733315.wav", "281474976734785.wav", "281474976733998.wav",
    "281474976734754.wav", "281474976734118.wav", "281474976733985.wav",
    "281474976734031.wav", "281474976735759.wav", "281474976733995.wav",
    "281474976735758.wav", "281474976734026.wav", "281474976734052.wav",
    "281474976734123.wav", "281474976733317.wav", "281474976734781.wav",
    "281474976733764.wav", "281474976734764.wav", "281474976734737.wav",
    "281474976733748.wav", "281474976733332.wav", "281474976733778.wav",
    "281474976733625.wav", "281474976733603.wav", "281474976733331.wav",
    "281474976735760.wav", "281474976733825.wav", "281474976735762.wav",
    "281474976733590.wav", "281474976733830.wav", "281474976734753.wav",
    "281474976734748.wav", "281474976733860.wav", "281474976733755.wav",
    "281474976734007.wav", "281474976734745.wav", "281474976733770.wav",
    "281474976735657.wav", "281474976734015.wav", "281474976734765.wav",
    "281474976733350.wav", "281474976735754.wav", "281474976733333.wav",
    "281474976733318.wav", "281474976733345.wav", "281474976734002.wav",
    "281474976733613.wav", "281474976734787.wav", "281474976735674.wav",
    "281474976734009.wav", "281474976734011.wav", "281474976734116.wav",
    "281474976733855.wav", "281474976733853.wav", "281474976733594.wav",
    "281474976734110.wav", "281474976734117.wav", "281474976733321.wav",
    "281474976733347.wav", "281474976733343.wav", "281474976733993.wav",
    "281474976733609.wav", "281474976735739.wav", "281474976733607.wav",
    "281474976734045.wav", "281474976734722.wav", "281474976733984.wav",
    "281474976734103.wav", "281474976733773.wav", "281474976733978.wav",
    "281474976733327.wav", "281474976733761.wav", "281474976735749.wav",
    "281474976734025.wav", "281474976733826.wav", "281474976733320.wav"
]

# The rest of the script remains unchanged...
missing_files = []

# Ensure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Move files and log missing ones
for filename in filenames:
    source_path = os.path.join(source_dir, filename)
    destination_path = os.path.join(destination_dir, filename)

    if os.path.exists(source_path):
        shutil.move(source_path, destination_path)
        print(f"Moved: {filename}")
    else:
        print(f"Missing: {filename}")
        missing_files.append(filename)

# Write missing files to a log file
if missing_files:
    with open(missing_files_log, "w") as f:
        for file in missing_files:
            f.write(file + "\n")

    print(f"Missing files list saved to {missing_files_log}")

print("File transfer complete.")

In [None]:
import os
import shutil

# Define source and destination directories
source_dir = "/content/drive/MyDrive/Dataset/Validate/lifecycle"  # Change this to your source directory
destination_dir = "/content/drive/MyDrive/Dataset/Train/translation"  # Change this to your destination directory
missing_files_log = "missing_files.txt"

# List of filenames to move
filenames = [
    "281474976734020.wav", "281474976733776.wav", "281474976735671.wav", "281474976734109.wav", "281474976733614.wav", "281474976734030.wav", "281474976735765.wav", "281474976734129.wav", "281474976735737.wav", "281474976733824.wav", "281474976733852.wav", "281474976733339.wav", "281474976733334.wav", "281474976733338.wav", "281474976733836.wav", "281474976733753.wav", "281474976734792.wav", "281474976733620.wav", "281474976735669.wav", "281474976733777.wav", "281474976735672.wav", "281474976734036.wav", "281474976734005.wav", "281474976733626.wav", "281474976733353.wav", "281474976733612.wav", "281474976733747.wav", "281474976733600.wav", "281474976734796.wav", "281474976734004.wav", "281474976734771.wav", "281474976733989.wav", "281474976733845.wav", "281474976734751.wav", "281474976734732.wav", "281474976734111.wav", "281474976733990.wav", "281474976733618.wav", "281474976733983.wav", "281474976734016.wav", "281474976735745.wav", "281474976733758.wav", "281474976734105.wav", "281474976734126.wav", "281474976733759.wav", "281474976734042.wav", "281474976733591.wav", "281474976733604.wav", "281474976734013.wav", "281474976734131.wav", "281474976735741.wav", "281474976734723.wav", "281474976734752.wav", "281474976734022.wav", "281474976733839.wav", "281474976734115.wav", "281474976733754.wav", "281474976733351.wav", "281474976734034.wav", "281474976733602.wav", "281474976733781.wav", "281474976734130.wav", "281474976733341.wav", "281474976734782.wav", "281474976734761.wav", "281474976733597.wav", "281474976735748.wav", "281474976734108.wav", "281474976733622.wav", "281474976733621.wav", "281474976734012.wav", "281474976733994.wav", "281474976734041.wav", "281474976734124.wav", "281474976734741.wav", "281474976733842.wav", "281474976734734.wav", "281474976734047.wav", "281474976734122.wav", "281474976734772.wav", "281474976734014.wav", "281474976735661.wav", "281474976733838.wav", "281474976734038.wav", "281474976735660.wav", "281474976733746.wav", "281474976733608.wav", "281474976733610.wav", "281474976735757.wav", "281474976734021.wav", "281474976733988.wav", "281474976734770.wav", "281474976733352.wav", "281474976733766.wav", "281474976733828.wav", "281474976734055.wav", "281474976735763.wav", "281474976735744.wav", "281474976733849.wav", "281474976734719.wav", "281474976733859.wav", "281474976734795.wav", "281474976735755.wav", "281474976734049.wav", "281474976733997.wav", "281474976735766.wav", "281474976733999.wav", "281474976734750.wav", "281474976734790.wav", "281474976735768.wav", "281474976735767.wav", "281474976735736.wav", "281474976733342.wav", "281474976733601.wav", "281474976734125.wav", "281474976734008.wav", "281474976734121.wav", "281474976733595.wav", "281474976734023.wav", "281474976734744.wav", "281474976733335.wav", "281474976733992.wav", "281474976734747.wav", "281474976733624.wav", "281474976733856.wav", "281474976734127.wav", "281474976735743.wav", "281474976734733.wav"
]

missing_files = []

# Ensure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Move files and log missing ones
for filename in filenames:
    source_path = os.path.join(source_dir, filename)
    destination_path = os.path.join(destination_dir, filename)

    if os.path.exists(source_path):
        shutil.move(source_path, destination_path)
        print(f"Moved: {filename}")
    else:
        print(f"Missing: {filename}")
        missing_files.append(filename)

# Write missing files to a log file
if missing_files:
    with open(missing_files_log, "w") as f:
        for file in missing_files:
            f.write(file + "\n")

    print(f"Missing files list saved to {missing_files_log}")

print("File transfer complete.")


In [None]:
import os
import pandas as pd

def check_and_delete_missing_transcriptions(csv_path, audio_dir):
    # Load CSV file
    df = pd.read_csv(csv_path)
    existing_ids = set(df['ID'].astype(str))

    # List all .wav files in the directory
    audio_files = {os.path.splitext(f)[0] for f in os.listdir(audio_dir) if f.endswith('.wav')}

    # Identify missing transcriptions
    missing = audio_files - existing_ids
    missing_count = len(missing)

    if missing:
        print(f"Found {missing_count} audio files without transcriptions in {audio_dir}. Deleting them now...")
        for missing_file in missing:
            file_path = os.path.join(audio_dir, f"{missing_file}.wav")
            try:
                os.remove(file_path)
                print(f"Deleted: {file_path}")
            except Exception as e:
                print(f"Error deleting {file_path}: {e}")
        print(f"Total {missing_count} files deleted from {audio_dir}.")
    else:
        print(f"All {len(audio_files)} audio files in {audio_dir} have transcriptions.")

# Example usage
check_and_delete_missing_transcriptions('/content/drive/MyDrive/Dataset/Train/train.csv', '/content/drive/MyDrive/Dataset/Train/translation')
check_and_delete_missing_transcriptions('/content/drive/MyDrive/Dataset/Validate/validate.csv', '/content/drive/MyDrive/Dataset/Validate/lifecycle')


DEVNAGARI TO IPA

In [None]:
!python -m allosaurus.bin.list_phone --lang hin

a aː b bː b̤ b̤ː d̠ d̤ d̪ d̪ː d̪̤ d̪̤ː d̻ e eː e̯ f fː h i iː j jː k kʰ kʰː kː l lː l̤ l̪ l̪̤ l̻ m mː m̤ n nː n̪ n̪ː n̪̤ n̻ o oː o̯ p pʰ pʰː pː q qː r rː r̪ s sː s̪ t̠ t̪ t̪ʰ t̪ʰː t̪ː t̻ u uː w z zː z̪ æ æː ŋ ŋː ɑ ɑ̃ ɔ ɔː ɔ̃ ɔ̃ː ɔ̤ ɖ ɖː ɖ̤ ɖ̤ː ə ə̃ ɛ ɛː ɛ̃ ɛ̃ː ɜ ɜ̃ ɟ ɟ̤ ɡ ɡː ɡ̤ ɡ̤ː ɦ ɪ ɪ̃ ɪ̈ ɲ ɳ ɽ ɽ̃ ɽ̤ ɾ ʁ ʂ ʃ ʃʰ ʃʰː ʃː ʈ ʈʰ ʈʰː ʈː ʊ ʊ̃ ʋ ʋː ʒ ʒː ʒ̤ ʒ̤ː ʔ ʝ β̞ χ


In [None]:
!pip install indic-transliteration



In [None]:
with open("/content/drive/MyDrive/Dataset/train/train.csv", encoding="utf-8") as f:
    for i in range(5):
        print(f.readline())


ID,IPA_Transcription

281474976729909,ʈ iː ʈ oː k ɛː s ɛː ɡ ɪ ɾ oː

281474976729966,m ɛː ɦ ə ɾ ə d̪ ɪ n aː ə l t̪ ə ə l t̪ ə ɦ ɛː ɾ aː n ə ɦ ɛː ɡ ə uː

281474976717958,t̪ ʊ m ə k ə l l ɪ b ə eː ɛː k ɪ t̪ aː b ə d̪ ɛː d̪ ɛː ɡ eː

281474976730085,m ɛː b aː eː ɛː eː k ə b ə aː d̪ ɛː ɾ ə uː



In [None]:
import pandas as pd
from indic_transliteration.sanscript import transliterate, DEVANAGARI, IAST
import re

# Hindi phoneme inventory (simplified for filtering)
hindi_inventory = set([
    "a", "aː", "b", "bː", "b̤", "b̤ː", "d̪", "d̪ː", "d̪ʱ", "d̪ʱː", "e", "eː", "ɛ", "ɛː", "f", "ɡ", "ɡː", "ɡ̤", "ɦ", "i", "iː",
    "j", "jː", "k", "kʰ", "kʰː", "l", "lː", "m", "mː", "n", "nː", "n̪", "n̪ː", "o", "oː", "p", "pʰ", "pʰː", "ɾ", "ɽ",
    "s", "sː", "t̪", "t̪ː", "t̪ʰ", "t̪ʰː", "u", "uː", "ʋ", "z", "ʃ", "ʈ", "ʈʰ", "ʈː", "ɖ", "ɖː", "ɖʱ", "ɖʱː", "ŋ", "ɲ", "ɳ",
    "ə", "ʊ", "ɔ", "ɔː", "ɪ", "ɟ", "ʒ", "ʂ", "æ"
])

# Basic IAST to IPA mapping
iast_to_ipa = {
    "ai": "ɛː", "au": "ɔː",
    "a": "ə", "ā": "aː", "i": "ɪ", "ī": "iː", "u": "ʊ", "ū": "uː",
    "e": "eː", "o": "oː",
    "k": "k", "kh": "kʰ", "g": "ɡ", "gh": "ɡ̤", "ṅ": "ŋ",
    "c": "ʈʃ", "ch": "ʈʃʰ", "j": "ɟ", "jh": "ɟʱ", "ñ": "ɲ",
    "ṭ": "ʈ", "ṭh": "ʈʰ", "ḍ": "ɖ", "ḍh": "ɖʱ", "ṇ": "ɳ",
    "t": "t̪", "th": "t̪ʰ", "d": "d̪", "dh": "d̪ʱ", "n": "n",
    "p": "p", "ph": "pʰ", "b": "b", "bh": "bʱ", "m": "m",
    "y": "j", "r": "ɾ", "l": "l", "v": "ʋ",
    "ś": "ʃ", "ṣ": "ʂ", "s": "s", "h": "ɦ",
    "ṁ": "ŋ", "ḥ": "ɦ"
}

# Convert Devanagari to space-separated IPA phonemes
def convert_to_ipa(dev_text):
    # Devanagari to IAST
    iast = transliterate(dev_text, DEVANAGARI, IAST)

    # Handle multi-letter mappings first
    for key in sorted(iast_to_ipa, key=len, reverse=True):
        iast = iast.replace(key, f" {iast_to_ipa[key]} ")

    # Remove extra spaces
    iast = re.sub(r"\s+", " ", iast).strip()

    # Keep only phonemes in Hindi inventory
    phonemes = [p for p in iast.split() if p in hindi_inventory]

    return " ".join(phonemes)

# Load and clean the file manually
with open("/content/drive/MyDrive/validate.csv", encoding="utf-8") as f:
    lines = f.readlines()

# Clean and parse into a list of (id, transcription)
rows = []
for line in lines[1:]:  # skip header
    parts = line.strip().strip(",").split(",")
    if len(parts) >= 2:
        id_str = parts[0].strip()
        text = ",".join(parts[1:]).strip(" ,\n")
        if id_str.isdigit() and text:
            rows.append((id_str, text))

print(f"✅ Loaded {len(rows)} valid rows.")

# Create DataFrame
df = pd.DataFrame(rows, columns=["ID", "Transcription"])

# Apply conversion
df["IPA_Transcription"] = df["Transcription"].apply(convert_to_ipa)

# Save output
output_path = "/content/drive/MyDrive/validate_IPA.csv"
df[["ID", "IPA_Transcription"]].to_csv(output_path, index=False, sep="\t")
print(f"✅ IPA file saved at {output_path}")


✅ Loaded 291 valid rows.
✅ IPA file saved at /content/drive/MyDrive/validate_IPA.csv


## **generating text and wave files:**

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Dataset/train/train.csv", sep=",", encoding="utf-8")
print("Columns in CSV:", df.columns.tolist())


Columns in CSV: ['ID', 'IPA_Transcription']


In [2]:
!pip install allosaurus

Collecting allosaurus
  Downloading allosaurus-1.0.2-py3-none-any.whl.metadata (400 bytes)
Collecting resampy (from allosaurus)
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Collecting panphon (from allosaurus)
  Downloading panphon-0.21.2-py2.py3-none-any.whl.metadata (15 kB)
Collecting unicodecsv (from panphon->allosaurus)
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munkres (from panphon->allosaurus)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->allosaurus)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->allosaurus)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->allosaurus)
  Downloading nvidia_cuda_cupti_c

In [None]:
!pip show allosaurus

Name: allosaurus
Version: 1.0.2
Summary: a multilingual phone recognizer
Home-page: https://github.com/xinjli/allosaurus
Author: Xinjian Li
Author-email: xinjianl@cs.cmu.edu
License: UNKNOWN
Location: /usr/local/lib/python3.11/dist-packages
Requires: editdistance, numpy, panphon, resampy, scipy, torch
Required-by: 


In [None]:
!python -m allosaurus.bin.download_model -m uni2005

downloading model  uni2510
from:  https://github.com/xinjli/allosaurus/releases/download/v1.0/uni2510.tar.gz
to:    /usr/local/lib/python3.11/dist-packages/allosaurus/pretrained
please wait...
Error: could not download the model
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/allosaurus/bin/download_model.py", line 25, in download_model
    resp = urlopen(url)
           ^^^^^^^^^^^^
  File "/usr/lib/python3.11/urllib/request.py", line 216, in urlopen
    return opener.open(url, data, timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/urllib/request.py", line 525, in open
    response = meth(req, response)
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/urllib/request.py", line 634, in http_response
    response = self.parent.error(
               ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/urllib/request.py", line 563, in error
    return self._call_chain(*args)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File 

In [None]:
import os
import pandas as pd

def create_data_files(data_dir, split):
    """
    Generates wave and text files from CSV transcription data.

    Args:
    - data_dir (str): Path to the dataset directory.
    - split (str): Either 'train' or 'validate' (subdirectory names).
    """
    audio_dir = os.path.join(data_dir, split, "audio")  # Path to audio folder
    csv_path = os.path.join(data_dir, split, f"{split}.csv")  # CSV file path
    wave_file = os.path.join(data_dir, split, "wave")  # Output wave file
    text_file = os.path.join(data_dir, split, "text")  # Output text file

    # Load the CSV file
    # Try reading as comma-separated
    df = pd.read_csv(csv_path, encoding="utf-8")
    if len(df.columns) == 1 and '\t' in df.columns[0]:
        # Fallback to tab-separated if only one column is detected
        df = pd.read_csv(csv_path, sep='\t', encoding="utf-8")

    # Strip column names and confirm
    df.columns = df.columns.str.strip()
    print(f"Columns after strip in {split}: {df.columns.tolist()}")


    with open(wave_file, "w", encoding="utf-8") as wf, open(text_file, "w", encoding="utf-8") as tf:
        for _, row in df.iterrows():
            utt_id = row["ID"]
            transcription = row["IPA_Transcription"]
            audio_path = os.path.join(audio_dir, f"{utt_id}.wav")

            # Ensure the audio file exists
            if not os.path.exists(audio_path):
                print(f"Warning: Audio file {audio_path} not found. Skipping...")
                continue

            # Write to wave file (mapping utt_id to audio path)
            wf.write(f"{utt_id} {audio_path}\n")

            # Write to text file (mapping utt_id to transcription)
            tf.write(f"{utt_id} {transcription}\n")

    print(f"Successfully created 'wave' and 'text' files in {split}/")

# Define dataset directory
dataset_path = "/content/drive/MyDrive/Dataset"  # Change this to your actual dataset path

# Generate files for train and validate sets
create_data_files(dataset_path, "train")
create_data_files(dataset_path, "validate")


Columns after strip in train: ['ID', 'IPA_Transcription']
Successfully created 'wave' and 'text' files in train/
Columns after strip in validate: ['ID', 'IPA_Transcription']
Successfully created 'wave' and 'text' files in validate/


EXTRACTING AUDIO FEATURES

In [None]:
!python -m allosaurus.bin.prep_feat --model=uni2005 --path=/content/drive/MyDrive/Dataset/validate

100% 291/291 [00:17<00:00, 16.74it/s]


EXTRACTING TEXT FEATURES

In [None]:
# command to prepare token
!python -m allosaurus.bin.prep_token --model=uni2005 --lang=hin --path=/content/drive/MyDrive/Dataset/train

  0% 0/1162 [00:00<?, ?it/s]100% 1162/1162 [00:00<00:00, 34411.83it/s]


In [None]:
# command to fine_tune your data
!python -m allosaurus.bin.adapt_model --pretrained_model=uni2005 --new_model=uni2510 --path="/content/drive/MyDrive/Dataset_allosaurus" --lang=hin --device_id=0 --epoch=20

epoch[batch]: 00[0000] | train loss 5.59926 train per 0.85262
epoch[batch]: 00[0010] | train loss 4.90026 train per 0.81228
epoch[batch]: 00[0020] | train loss 4.00996 train per 0.74589
epoch[batch]: 00[0030] | train loss 3.47739 train per 0.71789
epoch[batch]: 00[0040] | train loss 3.08052 train per 0.68517
epoch[batch]: 00[0050] | train loss 2.62968 train per 0.61481
epoch0 | validate per : 0.59809
saving model
epoch[batch]: 01[0000] | train loss 2.46676 train per 0.59928
epoch[batch]: 01[0010] | train loss 2.42195 train per 0.57673
epoch[batch]: 01[0020] | train loss 2.30216 train per 0.55101
epoch[batch]: 01[0030] | train loss 2.16647 train per 0.52643
epoch[batch]: 01[0040] | train loss 2.06605 train per 0.50643
epoch[batch]: 01[0050] | train loss 1.95873 train per 0.49510
epoch1 | validate per : 0.49004
saving model
epoch[batch]: 02[0000] | train loss 1.95601 train per 0.50157
epoch[batch]: 02[0010] | train loss 1.84144 train per 0.47634
epoch[batch]: 02[0020] | train loss 1.7510

In [None]:
!find / -type d -name "uni2510" 2>/dev/null


/usr/local/lib/python3.11/dist-packages/allosaurus/pretrained/uni2510


In [None]:
!cp -r /usr/local/lib/python3.11/dist-packages/allosaurus/pretrained/uni2510 /content/drive/MyDrive/uni2510


In [None]:
# command to check all your models
!python -m allosaurus.bin.list_model

Available Models
- uni2510 (default)
- uni2005


In [None]:
!python -m allosaurus.run --lang hin --model uni2510 --device_id 0 -i "/content/drive/MyDrive/Dataset_allosaurus/validate/audio/281474976714320.wav"

m eː s ə d̪ ə d̪ ʊ eː ɛː ɾ ə eː ɛː


ROUGH

In [None]:
import os
import subprocess
import csv

# Set your parameters
input_directory = "/content/drive/MyDrive/Dataset_allosaurus/validate/audio"
lang = "hin"
model = "uni2510"
device_id = "0"

# Output CSV file path
output_csv = "/content/drive/MyDrive/Dataset_allosaurus/validate/transcriptions.csv"

# Make a list of all WAV files
wav_files = [f for f in os.listdir(input_directory) if f.endswith(".wav")]
total_files = len(wav_files)

print(f"Found {total_files} WAV files.\n")

# Create a new CSV file and write the header
with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["ID", "Transcription"])  # Write header

    # Process each file
    for idx, filename in enumerate(wav_files, start=1):
        filepath = os.path.join(input_directory, filename)

        # Build the command
        command = [
            "python", "-m", "allosaurus.run",
            "--lang", lang,
            "--model", model,
            "--device_id", device_id,
            "-i", filepath
        ]

        # Run Allosaurus and capture output
        result = subprocess.run(command, capture_output=True, text=True)
        transcription = result.stdout.strip()

        # Write ID and transcription to CSV immediately
        writer.writerow([filename, transcription])

        # Print progress and transcription
        print(f"[{idx}/{total_files}] Processed: {filename}")
        print("Transcription:")
        print(transcription)
        print("-" * 50)

print(f"\n✅ All transcriptions saved to: {output_csv}")


Found 299 WAV files.

[1/299] Processed: 281474976735665.wav
Transcription:
b ə aː k ɔː n aː m ə p ə ɖ d̪ ə ɟ iː d̪ t̪ eː ɛː aː ə b ə n ə b ɡ ɛː ɾ aː ɦ ɔː k oː t̪ ʊ eː ɛː k oː n ə l iː ɡ ʊ n d̪ ə l iː p ə t̪ ʊ aː b ə k ɪ ɾ aː d̪ eː k eː b ə aː k ə n aː m ə t̪ ə p ɔː ɟ aː t̪ ʊ eː ɛː
--------------------------------------------------
[2/299] Processed: 281474976735664.wav
Transcription:
n aː m ɔː ɾ ə k ɪ ə m k ɪ k aː ɾ ɪ k ɾ ə m ə m ɪ ɾ ɔː l iː m ɔː l iː aː b ə l ɟ ə n uː ɡ ɔː b ʊ l aː l ə aː d̪ ɪ k aː p iː oː k iː aː ɟ aː t̪ ʊ eː ɛː
--------------------------------------------------
[3/299] Processed: 281474976735659.wav
Transcription:
b ə p ɛː d̪ ə ɦ ɔː n eː k ɪ b aː ɾ ɪ t̪ iː k iː ɟ aː t̪ eː ɛː s ə m ɪ d̪ aː b ə t̪ ə k iː t̪ ɪ p ɪ ɟ ə s ə m ɛː k eː ɟ ɪ t̪ n eː b iː l oː ɡ oː t̪ ə ɟ eː s ɪ p ə s ʊ ɟ ɪ s k eː b aː ə ɟ ɪ t̪ ə n aː ə p ɛː s aː b ʊ s ə k j ə aː n ə s aː ɾ ə ɦ oː l oː ɡ ə b aː b ə t̪ ə k ə ɾ ə p eː ɔː ɾ ə l oː ɡ uː k ə n ɪ m ə n t̪ ə ɾ ɪ s k ə ɾ k eː aː b ə t̪ ə k aː b aː t

In [None]:
import pandas as pd

# Step 1: Load the CSV file
file_path = '/content/drive/MyDrive/transcriptions.csv'  # <-- change this to your file path
df = pd.read_csv(file_path)

# Step 2: Modify the 'ID' column to remove ".wav"
df['ID'] = df['ID'].str.replace('.wav', '', regex=False)

# Step 3: Save the modified DataFrame back to CSV
output_path = '/content/drive/MyDrive/transcriptions.csv'  # <-- change this if you want a different output name
df.to_csv(output_path, index=False)

print(f"File saved as {output_path}")


File saved as /content/drive/MyDrive/transcriptions.csv


In [None]:
import pandas as pd

# Load the two CSV files (ASR and Ground Truth in IPA format)
# Load the two CSV files (ASR and Ground Truth in IPA format)
asr_df = pd.read_csv('/content/drive/MyDrive/transcriptions.csv')
gt_df = pd.read_csv('/content/drive/MyDrive/validate_IPA.csv', delimiter='\t')

# Check the column names to ensure they are loaded correctly
print("ASR DataFrame columns:", asr_df.columns)
print("Ground Truth DataFrame columns:", gt_df.columns)

# Rename Ground Truth columns to match ASR DataFrame column names
gt_df.columns = ['ID', 'Transcription']

# Now merge on 'ID'
merged_df = pd.merge(gt_df, asr_df, on='ID', suffixes=('_gt', '_asr'))

# Preprocessing: lowercasing and removing punctuations (optional, depends on what you want)
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

merged_df['Transcription_gt_clean'] = merged_df['Transcription_gt'].apply(preprocess)
merged_df['Transcription_asr_clean'] = merged_df['Transcription_asr'].apply(preprocess)

# Function to compare two sentences
def analyze_differences(gt, asr):
    gt_words = gt.split()
    asr_words = asr.split()

    matcher = difflib.SequenceMatcher(None, gt_words, asr_words)

    substitutions = []
    insertions = []
    deletions = []

    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == 'replace':
            substitutions.append((gt_words[i1:i2], asr_words[j1:j2]))
        elif opcode == 'insert':
            insertions.append(asr_words[j1:j2])
        elif opcode == 'delete':
            deletions.append(gt_words[i1:i2])

    return substitutions, insertions, deletions

# Apply to all rows
error_summary = []
for idx, row in merged_df.iterrows():
    subs, ins, dels = analyze_differences(row['Transcription_gt_clean'], row['Transcription_asr_clean'])
    error_summary.append({
        'ID': row['ID'],
        'Substitutions': subs,
        'Insertions': ins,
        'Deletions': dels
    })

error_df = pd.DataFrame(error_summary)

# Count error types overall
all_subs = []
all_ins = []
all_dels = []

for errors in error_summary:
    all_subs.extend(errors['Substitutions'])
    all_ins.extend(errors['Insertions'])
    all_dels.extend(errors['Deletions'])

substitution_counter = Counter([f"{' '.join(gt)} --> {' '.join(asr)}" for gt, asr in all_subs])
insertion_counter = Counter([phoneme for ins in all_ins for phoneme in ins])
deletion_counter = Counter([phoneme for dels in all_dels for phoneme in dels])

# Print some basic analysis
print("\nTop Substitutions:")
for pair, count in substitution_counter.most_common(10):
    print(f"{pair}: {count} times")

print("\nTop Insertions (ASR added extra phonemes):")
for phoneme, count in insertion_counter.most_common(10):
    print(f"{phoneme}: {count} times")

print("\nTop Deletions (ASR missed phonemes):")
for phoneme, count in deletion_counter.most_common(10):
    print(f"{phoneme}: {count} times")

# Save detailed error report
error_df.to_csv('error_analysis_report_ipa.csv', index=False)
print("\nDetailed error report saved as 'error_analysis_report_ipa.csv'")



ASR DataFrame columns: Index(['ID', 'Transcription'], dtype='object')
Ground Truth DataFrame columns: Index(['ID', 'IPA_Transcription'], dtype='object')

Top Substitutions:
ɪ --> ə: 24 times
ə --> ɪ: 19 times
oː --> ɔː: 18 times
eː --> ɪ: 17 times
ɦ --> eː: 15 times
ɪ --> eː: 13 times
ɔː --> ə: 12 times
aː --> ə: 12 times
uː --> ʊ: 10 times
ɛː --> ə: 10 times

Top Insertions (ASR added extra phonemes):
ə: 263 times
ɪ: 115 times
aː: 110 times
eː: 96 times
k: 90 times
ɛː: 74 times
t: 71 times
ɾ: 68 times
b: 57 times
n: 47 times

Top Deletions (ASR missed phonemes):
ə: 246 times
eː: 101 times
ɪ: 93 times
ɾ: 72 times
ɛː: 69 times
t: 63 times
ɦ: 58 times
aː: 52 times
k: 45 times
b: 43 times

Detailed error report saved as 'error_analysis_report_ipa.csv'


In [None]:
import pandas as pd
import difflib
from collections import Counter
import re

# Homophone dictionary for IPA phonemes (extend this as necessary)
ipa_homophones = {
    'eː': ['ɛː', 'ɛ'],
    'oː': ['ɔː'],
    'ʈʃ': ['ʈʂ'],
    'ʈʂ': ['ʈʃ'],
    'kʰ': ['k'],
    'g': ['ɡ'],
    'b': ['bː'],
    'd̪ʰ': ['d̪']
}

# Function to check if two IPA phonemes are homophones
def are_homophones(ipa1, ipa2):
    for key, values in ipa_homophones.items():
        if ipa1 == key and ipa2 in values:
            return True
        if ipa2 == key and ipa1 in values:
            return True
    return False

# Load the two CSV files (ASR and Ground Truth in IPA format)
asr_df = pd.read_csv('/content/drive/MyDrive/transcriptions.csv')
gt_df = pd.read_csv('/content/drive/MyDrive/validate_IPA.csv', delimiter='\t')

# Rename Ground Truth columns to match ASR DataFrame column names
gt_df.columns = ['ID', 'Transcription']

# Merge on 'ID'
merged_df = pd.merge(gt_df, asr_df, on='ID', suffixes=('_gt', '_asr'))

# Function to compare two IPA transcriptions
def analyze_differences(gt, asr):
    gt_phonemes = gt.split()
    asr_phonemes = asr.split()

    matcher = difflib.SequenceMatcher(None, gt_phonemes, asr_phonemes)

    substitutions = []
    insertions = []
    deletions = []

    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == 'replace':
            substitutions.append((gt_phonemes[i1:i2], asr_phonemes[j1:j2]))
        elif opcode == 'insert':
            insertions.append(asr_phonemes[j1:j2])
        elif opcode == 'delete':
            deletions.append(gt_phonemes[i1:i2])

    return substitutions, insertions, deletions

# Function to calculate Word Error Rate (WER)
def calculate_wer(gt, asr):
    gt_phonemes = gt.split()
    asr_phonemes = asr.split()

    # Get the Levenshtein distance (edit distance) between the two
    matcher = difflib.SequenceMatcher(None, gt_phonemes, asr_phonemes)
    edits = matcher.get_opcodes()

    substitutions = sum(1 for tag, i1, i2, j1, j2 in edits if tag == 'replace')
    insertions = sum(1 for tag, i1, i2, j1, j2 in edits if tag == 'insert')
    deletions = sum(1 for tag, i1, i2, j1, j2 in edits if tag == 'delete')

    # Calculate WER
    wer = (substitutions + insertions + deletions) / len(gt_phonemes)
    return wer, substitutions, insertions, deletions

# Apply to all rows
error_summary = []
for idx, row in merged_df.iterrows():
    subs, ins, dels = analyze_differences(row['Transcription_gt'], row['Transcription_asr'])

    # Calculate WER for the current row
    wer, sub_count, ins_count, del_count = calculate_wer(row['Transcription_gt'], row['Transcription_asr'])

    # Count homophones in substitutions
    homophone_matches = []
    for gt_sub, asr_sub in subs:
        if any(are_homophones(gt, asr) for gt in gt_sub for asr in asr_sub):
            homophone_matches.append((gt_sub, asr_sub))

    error_summary.append({
        'ID': row['ID'],
        'Substitutions': subs,
        'Insertions': ins,
        'Deletions': dels,
        'WER': wer,
        'Homophone Matches': homophone_matches
    })

error_df = pd.DataFrame(error_summary)

# Count error types overall
all_subs = []
all_ins = []
all_dels = []
all_homophones = []

for errors in error_summary:
    all_subs.extend(errors['Substitutions'])
    all_ins.extend(errors['Insertions'])
    all_dels.extend(errors['Deletions'])
    all_homophones.extend(errors['Homophone Matches'])

substitution_counter = Counter([f"{' '.join(gt)} --> {' '.join(asr)}" for gt, asr in all_subs])
insertion_counter = Counter([phoneme for ins in all_ins for phoneme in ins])
deletion_counter = Counter([phoneme for dels in all_dels for phoneme in dels])
homophone_counter = Counter([f"{' '.join(gt)} --> {' '.join(asr)}" for gt, asr in all_homophones])

# Print some basic analysis
print("\nTop Substitutions:")
for pair, count in substitution_counter.most_common(10):
    print(f"{pair}: {count} times")

print("\nTop Insertions (ASR added extra phonemes):")
for phoneme, count in insertion_counter.most_common(10):
    print(f"{phoneme}: {count} times")

print("\nTop Deletions (ASR missed phonemes):")
for phoneme, count in deletion_counter.most_common(10):
    print(f"{phoneme}: {count} times")

print("\nTop Homophone Matches (substituted homophones):")
for pair, count in homophone_counter.most_common(10):
    print(f"{pair}: {count} times")

# Save detailed error report
error_df.to_csv('/content/drive/MyDrive/error_analysis_report_with_wer_and_homophones_ipa.csv', index=False)
print("\nDetailed error report saved as 'error_analysis_report_with_wer_and_homophones_ipa.csv'")




Top Substitutions:
ɪ --> ə: 24 times
ə --> ɪ: 19 times
oː --> ɔː: 18 times
eː --> ɪ: 17 times
ɦ --> eː: 15 times
ɪ --> eː: 13 times
ɔː --> ə: 12 times
aː --> ə: 12 times
uː --> ʊ: 10 times
ɛː --> ə: 10 times

Top Insertions (ASR added extra phonemes):
ə: 273 times
aː: 115 times
ɪ: 110 times
eː: 103 times
k: 93 times
ɛː: 77 times
t̪: 75 times
ɾ: 74 times
b: 55 times
m: 50 times

Top Deletions (ASR missed phonemes):
ə: 244 times
eː: 100 times
ɪ: 92 times
ɾ: 71 times
ɛː: 68 times
t̪: 62 times
ɦ: 58 times
aː: 52 times
k: 45 times
b: 43 times

Top Homophone Matches (substituted homophones):
oː --> ɔː: 18 times
ɔː --> oː: 7 times
ɛː --> eː: 7 times
eː --> ɛː: 5 times
ɦ oː --> ɔː: 2 times
eː --> ɪ ɛː: 2 times
eː ɦ ə --> ɛː: 2 times
ə oː --> ɔː: 1 times
ɛː --> ə eː ə: 1 times
j oː --> iː ɦ ɔː: 1 times

Detailed error report saved as 'error_analysis_report_with_wer_and_homophones_ipa.csv'


In [None]:
import json
import torch
import os

def load_model(model_name):
    # Path to the models.json file
    models_json_path = '/content/drive/MyDrive/uni2510_model.json'  # Update this if it's located elsewhere

    # Check if the models.json file exists
    if not os.path.exists(models_json_path):
        raise FileNotFoundError(f"{models_json_path} not found. Ensure the file is present.")

    # Load the model registry (models.json)
    with open(models_json_path, 'r') as f:
        model_registry = json.load(f)

    # Check if the model_name exists in the registry
    if model_name in model_registry['models']:
        model_path = model_registry['models'][model_name]
        print(f"Loading model: {model_name} from {model_path}")

        # Ensure that the model directory exists
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model directory {model_path} does not exist.")

        # Ensure that the model file (model.pt) exists
        model_file_path = os.path.join(model_path, 'model.pt')
        if not os.path.exists(model_file_path):
            raise FileNotFoundError(f"Model file {model_file_path} not found.")

        # Load the model using PyTorch (or your preferred framework)
        try:
            # You can modify this to load the model in a different way if needed
            model = torch.load(model_file_path)
            print(f"Model {model_name} loaded successfully.")
            return model

        except Exception as e:
            raise RuntimeError(f"Error loading model {model_name}: {e}")

    else:
        raise ValueError(f"Model {model_name} not found in the model registry.")

# Example usage:
try:
    model_name = "uni2510"  # Use your model name here
    model = load_model(model_name)
    # Perform further operations with the model here
except Exception as e:
    print(f"Error: {e}")


Loading model: uni2510 from /content/drive/MyDrive/uni2510
Model uni2510 loaded successfully.


Collecting allosaurus@ git+https://github.com/pswiercz/allosaurus.git
  Cloning https://github.com/pswiercz/allosaurus.git to /tmp/pip-install-mmsrllk2/allosaurus_b7ed71d331b24489bdeee87d91bd4067
  Running command git clone --filter=blob:none --quiet https://github.com/pswiercz/allosaurus.git /tmp/pip-install-mmsrllk2/allosaurus_b7ed71d331b24489bdeee87d91bd4067
  fatal: could not read Username for 'https://github.com': No such device or address
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/pswiercz/allosaurus.git[0m[32m [0m[32m/tmp/pip-install-mmsrllk2/[0m[32mallosaurus_b7ed71d331b24489bdeee87d91bd4067[0m did not run successfully.
  [31m│[0m exit code: [1;36m128[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-wi

In [7]:
import torch
import os
from allosaurus.predictor import Predictor

# 1. Load the Predictor
def load_predictor():
    # Path to the custom model directory (where model.pt exists)
    model_dir = "/content/drive/MyDrive/"
    model_tag = "uni2510"

    # Create the Predictor with the model
    predictor = Predictor(model_tag=model_tag, model_dir=model_dir)
    print(f"Loaded predictor with model: {model_tag}")
    return predictor

# 2. Recognize phonemes from a wav file
def recognize_phonemes(predictor, wav_path):
    phoneme_sequence = predictor.predict(wav_path, lang_id="eng")  # assuming the language ID is "eng"
    return phoneme_sequence

# Convert phoneme sequence to Hindi text
def ipa_to_hindi(phoneme_sequence):
    # Define your custom IPA to Hindi mapping
    ipa_to_hindi_map = {
        "ə": "अ",
        "aː": "आ",
        "ɪ": "इ",
        "iː": "ई",
        "ʊ": "उ",
        "uː": "ऊ",
        "eː": "ए",
        "oː": "ओ",
        "ɛː": "ऐ",
        "ɔː": "औ",

        "k": "क",
        "kʰ": "ख",
        "ɡ": "ग",
        "ɡ̤": "घ",
        "ŋ": "ङ",

        "ʈʃ": "च",
        "ʈʃʰ": "छ",
        "ɟ": "ज",
        "ɟʱ": "झ",
        "ɲ": "ञ",

        "ʈ": "ट",
        "ʈʰ": "ठ",
        "ɖ": "ड",
        "ɖʱ": "ढ",
        "ɳ": "ण",

        "t̪": "त",
        "t̪ʰ": "थ",
        "d̪": "द",
        "d̪ʱ": "ध",
        "n": "न",

        "p": "प",
        "pʰ": "फ",
        "b": "ब",
        "bʱ": "भ",
        "m": "म",

        "j": "य",
        "ɾ": "र",
        "l": "ल",
        "ʋ": "व",

        "ʃ": "श",
        "ʂ": "ष",
        "s": "स",
        "h": "ह",

        "ŋ": "ङ",
        "ɦ": "ह",

        "æ": "ऐ",
        "f": "फ़",
        "z": "ज़",
        "ʒ": "ज़",
    }

    phonemes = phoneme_sequence.split()
    hindi_output = ""

    for phoneme in phonemes:
        hindi_char = ipa_to_hindi_map.get(phoneme, phoneme)  # fallback: keep phoneme if no mapping
        hindi_output += hindi_char

    return hindi_output

# 4. Full Pipeline: WAV to Hindi Text
def process_audio(wav_path):
    predictor = load_predictor()

    # Step 1: Recognize phonemes
    phoneme_sequence = recognize_phonemes(predictor, wav_path)
    print(f"🎯 Phoneme Sequence: {phoneme_sequence}")

    # Step 2: Convert phonemes to Hindi
    hindi_text = ipa_to_hindi(phoneme_sequence)
    print(f"📝 Hindi Text: {hindi_text}")

    return hindi_text

# 5. Example Usage
if __name__ == "__main__":
    wav_file = "/path/to/your_audio.wav"  # ✅ Set your correct WAV file path here
    hindi_text = process_audio(wav_file)
    print("\n✅ Final Hindi Output:", hindi_text)

ModuleNotFoundError: No module named 'allosaurus.predictor'

In [None]:
import os
import torchaudio
from allosaurus.app import read_recognizer

# Load the model
def load_model():
    # Load the default recognizer (or you can pass a specific model name)
    model = read_recognizer()
    print("Model loaded successfully.")
    return model

# Function to perform transcription on audio
def transcribe_audio(model, audio_file_path):
    # Just pass the path to model.recognize
    transcription = model.recognize(audio_file_path)
    return transcription

# Example usage
model = load_model()

# Path to the WAV file you want to transcribe
audio_file_path = '/content/drive/MyDrive/Dataset_allosaurus/validate/audio/281474976714320.wav'

# Transcribe the audio
transcription = transcribe_audio(model, audio_file_path)

print("Transcription:", transcription)

Model loaded successfully.
Transcription: m ɪ s̪ a d ɪ d uə e ɾ ɒ w e
