In [1]:
!pip install jiwer -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h

In [2]:
import pandas as pd
from tqdm import tqdm
from glob import glob
import os
import librosa
tqdm.pandas()
import requests
from transformers import pipeline
from pathlib import Path
from jiwer import wer, cer
import numpy as np

In [3]:
# Load the model-output CSV. The columns printed below are the ones the later
# cells rely on ('district', 'prediction', 'transcripts').
df = pd.read_csv('/kaggle/input/interspeech-2025/wav2vec2_ansary/wav2vec2_ansary/fb_w2v2_outputs.csv')

# Get the number of columns
num_columns = df.shape[1]
print(f"Number of columns in the dataset: {num_columns}")

# Get column names
print("\nColumn names:")
print(df.columns.tolist())

# Get distinct values in the district column. Missing values are dropped first:
# NaN is a float and sorted() cannot order it against the district-name strings,
# so a single unlabeled row would otherwise raise TypeError below.
distinct_districts = df['district'].dropna().unique()
num_distinct_districts = len(distinct_districts)

print(f"\nNumber of distinct districts: {num_distinct_districts}")
print("\nList of distinct districts:")
print(sorted(distinct_districts.tolist()))  # Sorted for better readability

Number of columns in the dataset: 5

Column names:
['model', 'district', 'file_name', 'prediction', 'transcripts']

Number of distinct districts: 15

List of distinct districts:
['barishal', 'chittagong', 'comilla', 'habiganj', 'jhenaidah', 'kishoreganj', 'lakshmipur', 'narail', 'narsingdi', 'nilphamari', 'noakhali', 'rangpur', 'sandwip', 'sylhet', 'tangail']


In [6]:
def calc_wer_cer(ground_truth,prediction):
    """Return ``(WER, CER)`` for one reference/hypothesis pair.

    Both scores are computed with jiwer's ``wer`` and ``cer`` and rounded to
    6 decimal places.

    Args:
        ground_truth: Reference transcript. Non-string values are converted
            with ``str``; missing values (``None``/NaN) count as empty text.
        prediction: Model output, handled the same way as ``ground_truth``.

    Returns:
        tuple[float, float]: ``(WER, CER)``. When the reference is empty the
        rates are undefined, so by convention ``(0.0, 0.0)`` is returned if
        the prediction is empty too, and ``(1.0, 1.0)`` otherwise.
    """
    # pandas loads empty CSV cells as NaN; a plain str() would turn that into
    # the literal text "nan" and score it as if it had actually been
    # transcribed / predicted.
    ground_truth = "" if pd.isna(ground_truth) else str(ground_truth)
    prediction = "" if pd.isna(prediction) else str(prediction)

    if not ground_truth.strip():
        # Error rates divide by the reference length, so an empty reference has
        # no defined score (jiwer is expected to reject it with an error rather
        # than return a number). Fall back to a fixed convention instead.
        fallback = 0.0 if not prediction.strip() else 1.0
        return fallback, fallback

    WER = round(wer(ground_truth, prediction),6)
    CER = round(cer(ground_truth, prediction),6)

    return WER, CER

In [8]:


# Read the CSV file (re-read here so this cell does not depend on earlier cells
# having left `df` untouched)
df = pd.read_csv('/kaggle/input/interspeech-2025/wav2vec2_ansary/wav2vec2_ansary/fb_w2v2_outputs.csv')

# Score every row exactly once; calc_wer_cer returns a (WER, CER) pair.
sample_scores = [
    calc_wer_cer(reference, hypothesis)
    for reference, hypothesis in zip(df['transcripts'], df['prediction'])
]
scored_df = df.assign(
    WER=[pair[0] for pair in sample_scores],
    CER=[pair[1] for pair in sample_scores],
)

# Rows without a district label cannot be attributed to any group. Report and
# skip them explicitly instead of failing while sorting NaN against strings.
unlabeled_rows = int(scored_df['district'].isna().sum())
if unlabeled_rows:
    print(f"Note: skipping {unlabeled_rows} rows with no district label")
    scored_df = scored_df.dropna(subset=['district'])

# Mean WER / CER and sample count per district (groupby sorts the district
# names, matching the alphabetical order of the report).
results_df = (
    scored_df
    .groupby('district', sort=True)
    .agg(WER=('WER', 'mean'), CER=('CER', 'mean'), Samples=('WER', 'size'))
    .round({'WER': 6, 'CER': 6})
    .rename_axis('District')
    .reset_index()
)

# "Overall" is the unweighted mean of the district means (macro-average): the
# smallest district counts exactly as much as the largest one.
overall_wer = round(results_df['WER'].mean(), 6)
overall_cer = round(results_df['CER'].mean(), 6)
total_samples = results_df['Samples'].sum()

# Sample-weighted counterpart: the mean over all scored rows, so each district
# contributes in proportion to its number of samples.
weighted_wer = round(scored_df['WER'].mean(), 6)
weighted_cer = round(scored_df['CER'].mean(), 6)

# NOTE(review): the recorded run of this cell shows WER == 1.000000 for every
# district and CER ~ 0.998. That pattern usually means predictions and
# transcripts share almost no tokens at all (e.g. different script or text
# normalization) -- verify the two columns before trusting these numbers.

# Display results with 6-decimal float formatting
pd.set_option('display.float_format', lambda x: '%.6f' % x)
print("District-wise WER and CER values:")
print(results_df.to_string(index=False))
print("\nOverall Metrics:")
print(f"Overall WER: {overall_wer:.6f}")
print(f"Overall CER: {overall_cer:.6f}")
print(f"Total Samples: {total_samples}")
print(f"Sample-weighted WER: {weighted_wer:.6f}")
print(f"Sample-weighted CER: {weighted_cer:.6f}")

# Optional: Save results to CSV
results_df.to_csv('district_wise_metrics.csv', index=False, float_format='%.6f')

District-wise WER and CER values:
   District      WER      CER  Samples
   barishal 1.000000 0.999390      101
 chittagong 1.000000 0.999734      176
    comilla 1.000000 0.999001       32
   habiganj 1.000000 0.994722      117
  jhenaidah 1.000000 0.996379       15
kishoreganj 1.000000 0.997672      205
 lakshmipur 1.000000 0.998190        4
     narail 1.000000 0.998810      186
  narsingdi 1.000000 0.997677      137
 nilphamari 1.000000 0.996733       10
   noakhali 1.000000 0.995723       28
    rangpur 1.000000 0.998861      130
    sandwip 1.000000 0.998213      131
     sylhet 1.000000 0.999166      762
    tangail 1.000000 0.999278      127

Overall Metrics:
Overall WER: 1.000000
Overall CER: 0.997970
Total Samples: 2161
