In [3]:
! pip install librosa
! pip install jiwer
! pip install gradio
! pip  install datasets
! pip install evaluate
!pip install ipywidgets



In [None]:
!pip install evaluate jiwer

In [None]:
from datasets import load_from_disk
import pandas as pd

# Map each benchmark subset code to its dataset repository id.
# Every id follows the same "ganga4364/benchmark-stt-<CODE>" pattern.
datasets = {
    code: f"ganga4364/benchmark-stt-{code}"
    for code in ("AB", "CS", "HS", "MV", "NS", "NW", "PC", "TT")
}


In [17]:
# Load the benchmark inference results. This frame is merged with each dataset
# split on 'file_name' further down. Replace the path with your actual file path.
bm_inf_df = pd.read_csv(
    filepath_or_buffer="./wav2vec2/benchmark_v3_inference.csv",
)

In [None]:
from evaluate import load

# Load the "cer" metric once so the same object is reused for every scored row.
cer_metric = load(path="cer")

In [None]:
def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN/NA) to ""."""
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return ""
    return str(value)


def calculate_cer(row):
    """Compute the character error rate (CER) for one row, clipped to 1.0.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide ``row['uni']`` (reference text) and ``row['inf']``
        (predicted text).

    Returns
    -------
    float
        * the CER from ``cer_metric``, capped at 1.0, when both texts are non-empty;
        * 1.0 when the prediction is missing/empty but the reference is not
          (every reference character is an error);
        * 1.0 when the reference is missing/empty but the prediction is not
          (CER is undefined there; the clip ceiling is reported);
        * 0.0 when both are missing/empty;
        * NaN when the row cannot be scored at all (e.g. a column is absent or
          the metric raises). A ``RuntimeWarning`` is emitted in that case.

    Notes
    -----
    Failures are deliberately NOT reported as 0.0: a zero is a perfect score
    and would silently pull the average CER down.
    """
    import warnings

    try:
        reference = _as_text(row['uni'])
        prediction = _as_text(row['inf'])

        # Handle empty texts explicitly instead of relying on the metric
        # raising (an empty reference makes the CER denominator zero).
        if not reference:
            return 0.0 if not prediction else 1.0
        if not prediction:
            return 1.0

        cer = cer_metric.compute(references=[reference], predictions=[prediction])
        return min(cer, 1.0)
    except Exception as exc:  # best effort: one bad row must not abort the whole run
        warnings.warn(
            f"calculate_cer: row could not be scored ({exc!r}); returning NaN",
            RuntimeWarning,
            stacklevel=2,
        )
        return float("nan")

In [None]:

# Initialize the organized_datasets dictionary.
# Layout: {dataset_name: {"splits": {split_name: {"dataframe": DataFrame, "count": int}}}}
organized_datasets = {}

# Process each dataset
for dataset_name, url in datasets.items():
    print(f"Processing {dataset_name} dataset...")

    # Load the dataset from its local copy (the repository id in `url` is not needed here)
    dataset = load_from_disk(f"./downloaded_datasets/{dataset_name}")

    # Initialize the structure for this dataset
    organized_datasets[dataset_name] = {
        "splits": {},
    }

    # Process each split in the dataset
    for split_name in dataset.keys():
        split_df = dataset[split_name].to_pandas()

        # Inner merge: only rows whose 'file_name' occurs in both frames are kept
        merged_df = pd.merge(split_df, bm_inf_df, on='file_name', how='inner')

        # An inner merge can silently drop (or duplicate) rows; report it so the
        # per-split counts and averages are not misread as covering the full split.
        if len(merged_df) != len(split_df):
            print(
                f"  warning: {split_name}: {len(split_df)} rows in split, "
                f"{len(merged_df)} after merging with inference results"
            )

        # Score every merged row
        merged_df['cer'] = merged_df.apply(calculate_cer, axis=1)

        # Update the organized_datasets dictionary
        organized_datasets[dataset_name]["splits"][split_name] = {
            "dataframe": merged_df,
            "count": len(merged_df)
        }

# Print summary: one heading per dataset, then its splits
for dataset_name, dataset_info in organized_datasets.items():
    print(f"{dataset_name}:")
    for split_name, split_info in dataset_info['splits'].items():
        print(f"  - {split_name}: {split_info['count']} files")


In [None]:
!pip install tabulate

In [None]:
import pandas as pd
import numpy as np

# Initialize dictionaries to store results
split_summaries = {}    # (dataset_name, split_name) -> {'mean_cer', 'count'}
dataset_summaries = {}  # dataset_name -> {'mean_cer', 'total_files'}


def _nan_safe_mean(values):
    """Mean of ``values`` ignoring NaNs; returns NaN (without a warning) if none remain.

    A plain ``np.mean`` warns and returns NaN for an empty input, and a single
    NaN entry turns the whole result into NaN.
    """
    scores = np.asarray(list(values), dtype=float)
    scores = scores[~np.isnan(scores)]
    return scores.mean() if scores.size else np.nan


# Process each dataset
for dataset_name, dataset_info in organized_datasets.items():
    print(f"\nProcessing {dataset_name} dataset:")
    dataset_cer = []  # Per-row CER values pooled over all splits of this dataset

    # Process each split in the dataset
    for split_name, split_info in dataset_info['splits'].items():
        df = split_info['dataframe']

        # Calculate mean CER for the split (pandas skips NaN by default)
        mean_cer = df['cer'].mean()

        # Store split summary
        split_summaries[(dataset_name, split_name)] = {
            'mean_cer': mean_cer,
            'count': len(df)
        }

        # Accumulate CER values for dataset average
        dataset_cer.extend(df['cer'])

    # Dataset average over all pooled rows; NaN if the dataset has no scored rows
    dataset_mean_cer = _nan_safe_mean(dataset_cer)

    # Store dataset summary
    dataset_summaries[dataset_name] = {
        'mean_cer': dataset_mean_cer,
        'total_files': len(dataset_cer)
    }

    # Print dataset summary
    print(f"\nDataset Summary for {dataset_name}:")
    print(f"  Total files: {len(dataset_cer)}")
    print(f"  Average CER: {dataset_mean_cer:.4f}")

# Overall figure is a macro-average: the mean of the per-dataset means, so each
# dataset counts equally regardless of size. Datasets without a usable mean
# (no rows) are skipped instead of turning the overall value into NaN.
all_cer = [summary['mean_cer'] for summary in dataset_summaries.values()]
overall_mean_cer = _nan_safe_mean(all_cer)

print("\nOverall Summary:")
print(f"Average CER across all datasets: {overall_mean_cer:.4f}")
