In [None]:
import pandas as pd
import soundfile as sf
from pydub import AudioSegment
import tqdm
import matplotlib.pyplot as plt
import librosa
import seaborn as sns
from jiwer import wer
from jiwer import cer

In [None]:
pip install jiwer

# **Train Split**

In [None]:
# Training-split metadata; the file is comma-separated, which is read_csv's default.
df = pd.read_csv("/kaggle/input/ben10/ben10/16_kHz_train_audio/train.csv")
df.head()

# *NISQA Analysis*

In [None]:
# NISQA results for the training split, keyed by clip id and ordered from the
# lowest to the highest predicted mean opinion score.
df_qa = (
    pd.read_csv("/kaggle/input/bengali-regional-speech-ben10-data-nisqa-metrics/Bangla_Dialect_NISQA_results(train).csv", sep=",")
    .rename(columns={'deg': 'id'})  # rename to match the other frames
    .assign(id=lambda frame: frame['id'].apply(lambda name: name.split('.')[0]))  # keep the text before the first '.', dropping ".wav"
    .sort_values('mos_pred', ascending=True)
)
df_qa

In [None]:
# NISQA results for the comparison ("OOD Speech") dataset, prepared exactly like
# df_qa: same id column name, extension removed, sorted by predicted MOS.
ood_qa = (
    pd.read_csv("/kaggle/input/bengaliai-speech-train-nisqa/NISQA_wavfiles.csv", sep=",")
    .rename(columns={'deg': 'id'})  # rename to match the other frames
    .assign(id=lambda frame: frame['id'].apply(lambda name: name.split('.')[0]))  # keep the text before the first '.', dropping ".wav"
    .sort_values('mos_pred', ascending=True)
)
ood_qa

In [None]:
## Listen to the lowest overall quality sample

path = "/kaggle/input/ben10/ben10/16_kHz_train_audio/"
file = "train_habiganj (909)"

# The sample contains lots of background noise,
# interference of others, mobile ringing etc.

print(file)
display(AudioSegment.from_file(path + file + '.wav'))

# Print the full, untruncated transcription. option_context restores the
# previous column width when the block exits (even if the lookup raises),
# instead of leaving the global pandas option modified.
with pd.option_context('display.max_colwidth', None):
    print(df[df['file_name'] == file + '.wav'].transcriptions)

In [None]:
# Histogram of the predicted mean opinion score for the training split.
ax = df_qa['mos_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Mean Opinion Score', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Train MOS.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted mean opinion score for the OOD Speech data.
# Displayed only; this figure is not written to disk.
ax = ood_qa['mos_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Mean Opinion Score', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Overlay the raw-count MOS histograms of both datasets on one set of axes.
# Each series gets its own 50 bins; alpha keeps the overlap readable.
ax = df_qa['mos_pred'].plot.hist(bins=50, edgecolor='white', alpha=0.5, label='ben10')
ood_qa['mos_pred'].plot.hist(ax=ax, bins=50, edgecolor='white', alpha=0.5, label='OOD Speech')

ax.set_xlabel('Mean Opinion Score', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(loc='upper right', fontsize=12)

# Displayed only; this figure is not written to disk.
plt.show()


In [None]:
# Understanding the density metric

# import matplotlib.pyplot as plt
# import numpy as np

# # Plot histogram for the ben10 dataset with density=False to get frequencies
# count_ben10, bins_ben10, _ = plt.hist(df_qa['mos_pred'], bins=50, edgecolor='white', alpha=0.5, label='ben10', density=False)

# # Plot histogram for the OOD Speech dataset with density=False to get frequencies
# count_ood, bins_ood, _ = plt.hist(ood_qa['mos_pred'], bins=50, edgecolor='white', alpha=0.5, label='OOD Speech', density=False)

# # Calculate total samples for each dataset
# # N_ben10 = len(df_qa['mos_pred'])
# # N_OOD = len(ood_qa['mos_pred'])

# # # Bin width
# # bin_width = bins_ben10[1] - bins_ben10[0]

# # # Plot the density histogram for ben10
# # density_ben10 = count_ben10 / (N_ben10 * bin_width)
# # plt.hist(df_qa['mos_pred'], bins=50, edgecolor='white', alpha=0.5, label='ben10', density=True)

# # # Plot the density histogram for OOD Speech
# # density_ood = count_ood / (N_OOD * bin_width)
# # plt.hist(ood_qa['mos_pred'], bins=50, edgecolor='white', alpha=0.5, label='OOD Speech', density=True)

# # Add labels and legend
# plt.xlabel('Mean Opinion Score', fontsize=20)
# plt.ylabel('Density', fontsize=20)
# plt.tick_params(axis='both', which='major', labelsize=12)
# plt.legend(loc='upper right', fontsize=12)

# # Save the figure if needed
# # plt.savefig('/kaggle/working/Compare_MOS.pdf', bbox_inches='tight', dpi=100)

# # Show the plot
# plt.show()

# # Print total frequencies for verification
# print("Total frequencies for ben10 dataset:", np.sum(count_ben10))
# print("Total frequencies for OOD Speech dataset:", np.sum(count_ood))

# print((count_ben10))


In [None]:
import matplotlib.pyplot as plt

# Density-normalised MOS histograms of both datasets on shared axes, so the
# two distributions can be compared independently of dataset size.
ax = df_qa['mos_pred'].plot.hist(bins=50, edgecolor='white', alpha=0.5, density=True, label='ben10')
ood_qa['mos_pred'].plot.hist(ax=ax, bins=50, edgecolor='white', alpha=0.5, density=True, label='OOD Speech')

ax.set_xlabel('Mean Opinion Score', fontsize=20)
ax.set_ylabel('Relative Density', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(loc='upper right', fontsize=12)

plt.savefig('/kaggle/working/Compare_MOS.pdf', bbox_inches='tight', dpi=100)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Density-normalised histogram values and bin edges for each dataset.
# Each dataset is binned over its own value range, so the edges differ.
ben10_hist, ben10_bins = np.histogram(df_qa['mos_pred'], bins=50, density=True)
ood_hist, ood_bins = np.histogram(ood_qa['mos_pred'], bins=50, density=True)

# Draw the same two density histograms for visual reference.
plt.hist(df_qa['mos_pred'], bins=50, edgecolor='white', alpha=0.5, density=True, label='ben10')
plt.hist(ood_qa['mos_pred'], bins=50, edgecolor='white', alpha=0.5, density=True, label='OOD Speech')

plt.xlabel('Mean Opinion Score', fontsize=20)
plt.ylabel('Relative Density', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.legend(loc='upper right', fontsize=12)
plt.show()

# Per-bin probability mass = density * bin width. Each dataset must be
# multiplied by the widths of ITS OWN bins; the OOD values were previously
# scaled by the ben10 bin widths, which gave wrong per-bin masses.
ben10_mass = ben10_hist * np.diff(ben10_bins)
ood_mass = ood_hist * np.diff(ood_bins)

# Each total should be 1.0 (up to floating-point error) for a valid density.
print(f"Sum of ben10 histogram densities: {np.sum(ben10_mass)}")
print(f"Sum of OOD Speech histogram densities: {np.sum(ood_mass)}")
print('OOD BINS : ', ood_mass)
print('Ben10 BINS : ', ben10_mass)



In [None]:
# Histogram of the predicted noisiness score for the training split.
ax = df_qa['noi_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Noisiness', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Train Noisiness.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Density-normalised noisiness histograms of both datasets on shared axes.
ax = df_qa['noi_pred'].plot.hist(bins=50, edgecolor='white', alpha=0.5, density=True, label='ben10')
ood_qa['noi_pred'].plot.hist(ax=ax, bins=50, edgecolor='white', alpha=0.5, density=True, label='OOD Speech')

ax.set_xlabel('Noisiness', fontsize=20)
ax.set_ylabel('Relative Density', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(loc='upper right', fontsize=12)

plt.savefig('/kaggle/working/Compare_noi.pdf', bbox_inches='tight', dpi=100)
plt.show()


In [None]:
# Histogram of the predicted discontinuity score for the training split.
ax = df_qa['dis_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Discontinuity', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Train Disc.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Density-normalised discontinuity histograms of both datasets on shared axes.
ax = df_qa['dis_pred'].plot.hist(bins=50, edgecolor='white', alpha=0.5, density=True, label='ben10')
ood_qa['dis_pred'].plot.hist(ax=ax, bins=50, edgecolor='white', alpha=0.5, density=True, label='OOD Speech')

ax.set_xlabel('Discontinuity', fontsize=20)
ax.set_ylabel('Relative Density', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(loc='upper right', fontsize=12)

plt.savefig('/kaggle/working/Compare_dis.pdf', bbox_inches='tight', dpi=100)
plt.show()


In [None]:
# Histogram of the predicted coloration score for the training split.
df_qa['col_pred'].plot(kind='hist', bins=50, edgecolor='white')
plt.xlabel('Coloration', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Train Col.pdf', bbox_inches='tight', dpi=100)
# Finish the figure explicitly, like the other histogram cells, so a later
# plot cannot draw onto these axes when run outside the inline backend.
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Density-normalised coloration histograms of both datasets on shared axes.
ax = df_qa['col_pred'].plot.hist(bins=50, edgecolor='white', alpha=0.5, density=True, label='ben10')
ood_qa['col_pred'].plot.hist(ax=ax, bins=50, edgecolor='white', alpha=0.5, density=True, label='OOD Speech')

ax.set_xlabel('Coloration', fontsize=20)
ax.set_ylabel('Relative Density', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(loc='upper right', fontsize=12)

plt.savefig('/kaggle/working/Compare_col.pdf', bbox_inches='tight', dpi=100)
plt.show()


In [None]:
# Histogram of the predicted loudness score for the training split.
ax = df_qa['loud_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Loudness', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Train Loud.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Density-normalised loudness histograms of both datasets on shared axes.
ax = df_qa['loud_pred'].plot.hist(bins=50, edgecolor='white', alpha=0.5, density=True, label='ben10')
ood_qa['loud_pred'].plot.hist(ax=ax, bins=50, edgecolor='white', alpha=0.5, density=True, label='OOD Speech')

ax.set_xlabel('Loudness', fontsize=20)
ax.set_ylabel('Relative Density', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(loc='upper right', fontsize=12)

plt.savefig('/kaggle/working/Compare_loud.pdf', bbox_inches='tight', dpi=100)
plt.show()


In [None]:
## Let's threshold at the notch in the histogram:
## show the lowest-scoring clips whose predicted MOS is above 1.5.

df_qa.loc[df_qa['mos_pred'] > 1.5].sort_values(by='mos_pred').head(5)

In [None]:
## Let's listen to a sample with MOS around 1.5

path = "/kaggle/input/ben10/ben10/16_kHz_train_audio/"
file = "train_habiganj (246)"

## The ground truth matches, but some recording issues are still prevalent
## and the recording quality is low.

print(file)
display(AudioSegment.from_file(path + file + '.wav'))

# The training frame's text column is 'transcriptions' (it is read under that
# name elsewhere in this notebook); the former `.transcripts` attribute does
# not exist on `df` and raised AttributeError.
# option_context shows the full text and restores the previous column width
# on exit, even if the lookup raises.
with pd.option_context('display.max_colwidth', None):
    print(df[df['file_name'] == file + '.wav']['transcriptions'])

In [None]:
# Measure the duration of every training clip with librosa and attach the
# values, in row order, to a copy of the metadata frame.
path = "/kaggle/input/ben10/ben10/16_kHz_train_audio/"
audio_lengths = [
    librosa.get_duration(path=path + file_name)
    for file_name in df['file_name']
]

length_df = df.copy().assign(audio_length=audio_lengths)
length_df.head()

In [None]:
# Distribution of clip durations; the count axis is logarithmic so that
# sparsely populated bins remain visible.
ax = length_df['audio_length'].plot.hist(bins=80, edgecolor='white')
ax.set_xlabel('Audio Length (seconds)', fontsize=20)
ax.set_ylabel('Frequency(Log Scale)', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.set_yscale('log')
plt.savefig('/kaggle/working/Train audio_length dist.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Clips whose measured duration is below 1.
length_df.loc[length_df['audio_length'] < 1]

In [None]:
# Character count of every transcription, added to length_df in row order.
transcript_lengths = [len(text) for text in df['transcriptions']]

length_df['transcript_length'] = transcript_lengths
length_df.head()


In [None]:
# Histogram of transcription lengths (in characters) on a log-scaled count
# axis, so that sparsely populated bins remain visible.
length_df['transcript_length'].plot(kind='hist', bins=100, edgecolor='white')
plt.xlabel('Transcription Length (characters)', fontsize=20)
plt.ylabel('Frequency(Log Scale)', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.yscale('log')
plt.savefig('/kaggle/working/Train transcript_length dist.pdf', bbox_inches='tight', dpi=100)
# Finish the figure explicitly, like the other histogram cells, so a later
# plot cannot draw onto these axes when run outside the inline backend.
plt.show()

In [None]:
# Rows whose transcription has fewer than 5 characters.
length_df.loc[length_df['transcript_length'] < 5]

In [None]:
# Correlation (pandas' default method) between clip duration and
# transcription length.
correlation = length_df['audio_length'].corr(
    length_df['transcript_length']
)
print(
    f"Correlation between audio length and transcript length: {correlation:.2f}"
)

In [None]:
# Scatter of clip duration against transcription length, one point per row.
ax = sns.scatterplot(data=length_df, x='audio_length', y='transcript_length')

ax.set_xlabel('Audio Length (seconds)', fontsize=20)
ax.set_ylabel('Transcription Length (characters)', fontsize=17)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Train audio_transcript_length corr.pdf', bbox_inches='tight', dpi=100)
plt.show()


In [None]:
# Suspicious rows: long audio (> 10) paired with a very short transcription (< 5 characters).
length_df.query('transcript_length < 5 and audio_length > 10')

In [None]:

# VAD analysis for the training split, keyed by clip id and ordered by the
# 'silence' column (ascending).
df_vad = (
    pd.read_csv("/kaggle/input/bangla-regional-speech-ben10-data-vad-analysis/VAD Analysis Train.csv", sep=",")
    .rename(columns={'name': 'id'})  # rename to match the other frames
    .assign(id=lambda frame: frame['id'].apply(lambda name: name.split('.')[0]))  # keep the text before the first '.', dropping ".wav"
    .drop(columns=['Unnamed: 0'])  # leftover index column from the CSV
    .sort_values('silence', ascending=True)
)
df_vad.head()

In [None]:
# The district name is the run of letters that follows "train_" in the clip id.
df_vad['district'] = df_vad['id'].str.extract(r'train_([a-zA-Z]+)')

# Sum the silence and voice columns per district.
grouped_df = (
    df_vad.groupby('district')[['silence', 'voice']]
    .sum()
    .reset_index()
    .rename(columns={'silence': 'total_silence', 'voice': 'total_voice'})
)

# Silence expressed as a percentage of voice (not of the total duration).
grouped_df['silence_ratio(%)'] = grouped_df['total_silence'] * 100 / grouped_df['total_voice']
grouped_df.to_csv('District-wise VAD Analysis Train.csv')
grouped_df.tail()

In [None]:
# Pre-computed district-level VAD summary for the test split; the ratio
# column is renamed so it can be used as an attribute-safe identifier.
dft_grouped_vad = (
    pd.read_csv("/kaggle/input/bangla-regional-speech-ben10-data-vad-district/District-wise VAD Analysis Test.csv", sep=",")
    .drop(columns=['Unnamed: 0'])  # leftover index column from the CSV
    .rename(columns={'silence_ratio(%)': 'silence_ratio'})
)
dft_grouped_vad.head()

# *WER and CER*

In [None]:
## District-level WER and CER, entered by hand.
## The three lists are parallel: position i in each refers to the same district.
Districts = [
    'chittagong', 'kishoreganj', 'narsingdi', 'narail', 'rangpur',
    'tangail', 'habiganj', 'barishal', 'sylhet',
]
WER = [0.93, 0.92, 0.82, 0.78, 0.81, 0.60, 0.84, 0.84, 0.85]
CER = [0.59, 0.69, 0.54, 0.46, 0.53, 0.30, 0.49, 0.58, 0.53]

# One row per district, built from the zipped parallel lists.
df_err = pd.DataFrame(
    list(zip(Districts, WER, CER)),
    columns=['Districts', 'WER', 'CER'],
)

df_err.head()

In [None]:
# Look up each district's test-split silence ratio, in df_err row order.
# .iloc[0] takes the first matching row and raises IndexError when a
# district is absent from dft_grouped_vad.
silence_ratios = [
    dft_grouped_vad.loc[dft_grouped_vad['district'] == dist, 'silence_ratio'].iloc[0]
    for dist in df_err['Districts']
]

df_err['silence_ratios'] = silence_ratios
df_err.head(10)


In [None]:
# Correlation of the district silence ratio with each error rate.
corr_wer, corr_cer = (
    df_err['silence_ratios'].corr(df_err[error_col])
    for error_col in ('WER', 'CER')
)

# Report both coefficients rounded to two decimals.
print(f"Correlation between WER and silence_ratios: {corr_wer:.2f}")
print(f"Correlation between CER and silence_ratios: {corr_cer:.2f}")

# **Validation Split**

# *NISQA Analysis*

In [None]:
# NISQA results for the validation split, keyed by clip id and ordered from
# the lowest to the highest predicted mean opinion score.
dfv_qa = (
    pd.read_csv("/kaggle/input/bengali-regional-speech-ben10-data-nisqa-metrics/NISQA_results(validation).csv", sep=",")
    .rename(columns={'deg': 'id'})  # rename to match the other frames
    .assign(id=lambda frame: frame['id'].apply(lambda name: name.split('.')[0]))  # keep the text before the first '.', dropping ".wav"
    .sort_values('mos_pred', ascending=True)
)
dfv_qa.head()

In [None]:
## Listen to the lowest overall quality sample

path = "/kaggle/input/ben10/ben10/16_kHz_valid_audio/"
file = "valid_sylhet (144)"

# The sample contains lots of background noise,
# interference of others, mobile ringing etc.

print(file)
display(AudioSegment.from_file(f"{path}{file}.wav"))

# There is no transcript lookup for this split; only a fixed notice is printed.
# The display option is still widened and then reset around the print.
pd.set_option('display.max_colwidth', None)
print('Transcript: Not Publicly Available for Validation Set')
pd.reset_option('display.max_colwidth')

In [None]:
# Histogram of the predicted mean opinion score for the validation split.
ax = dfv_qa['mos_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Mean Opinion Score', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Val MOS.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted noisiness score for the validation split.
dfv_qa['noi_pred'].plot(kind='hist', bins=50, edgecolor='white')
plt.xlabel('Noisiness', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Val Noisiness.pdf', bbox_inches='tight', dpi=100)
# Finish the figure explicitly, like the other histogram cells, so a later
# plot cannot draw onto these axes when run outside the inline backend.
plt.show()

In [None]:
# Histogram of the predicted discontinuity score for the validation split.
ax = dfv_qa['dis_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Discontinuity', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Val Disc.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted coloration score for the validation split.
ax = dfv_qa['col_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Coloration', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Val Col.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted loudness score for the validation split.
ax = dfv_qa['loud_pred'].plot.hist(bins=50, edgecolor='white')
ax.set_xlabel('Loudness', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.savefig('/kaggle/working/Val Loud.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Display the validation NISQA frame (sorted by mos_pred when it was loaded).
dfv_qa

In [None]:
# VAD analysis for the validation split, keyed by clip id and ordered by the
# 'silence' column (ascending).
dfv_vad = (
    pd.read_csv("/kaggle/input/bangla-regional-speech-ben10-data-vad-analysis/VAD Analysis Val.csv", sep=",")
    .rename(columns={'name': 'id'})  # rename to match the other frames
    .assign(id=lambda frame: frame['id'].apply(lambda name: name.split('.')[0]))  # keep the text before the first '.', dropping ".wav"
    .drop(columns=['Unnamed: 0'])  # leftover index column from the CSV
    .sort_values('silence', ascending=True)
)
dfv_vad.head()

In [None]:
# The district name is the run of letters that follows "valid_" in the clip id.
dfv_vad['district'] = dfv_vad['id'].str.extract(r'valid_([a-zA-Z]+)')

# Sum the silence and voice columns per district.
grouped_dfv = (
    dfv_vad.groupby('district')[['silence', 'voice']]
    .sum()
    .reset_index()
    .rename(columns={'silence': 'total_silence', 'voice': 'total_voice'})
)

# Silence expressed as a percentage of voice (not of the total duration).
grouped_dfv['silence_ratio(%)'] = grouped_dfv['total_silence'] * 100 / grouped_dfv['total_voice']
grouped_dfv.to_csv('District-wise VAD Analysis Val.csv')
grouped_dfv.tail()

# **Test Split**

# *NISQA Analysis*

In [None]:
# NISQA results for the test split, keyed by clip id and ordered from the
# lowest to the highest predicted mean opinion score.
df_qta = (
    pd.read_csv("/kaggle/input/bengali-regional-speech-ben10-data-nisqa-metrics/Bangla_Dialect_NISQA_results(test).csv", sep=",")
    .rename(columns={'deg': 'id'})  # rename to match the other frames
    .assign(id=lambda frame: frame['id'].apply(lambda name: name.split('.')[0]))  # keep the text before the first '.', dropping ".wav"
    .sort_values('mos_pred', ascending=True)
)
df_qta.head()

In [None]:
# Histogram of the predicted mean opinion score for the test split.
ax = df_qta['mos_pred'].plot(kind='hist', bins=100, title='Distribution of Mean Opinion Score(MOS) in Test Set')
# Label the x axis with the plotted metric instead of the 'x axis' placeholder
# that previously ended up in the saved figure.
ax.set_xlabel('Mean Opinion Score')
plt.savefig('/kaggle/working/Test MOS.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted noisiness score for the test split.
ax = df_qta['noi_pred'].plot(kind='hist', bins=100, title='Distribution of Noisiness in Test Set')
# Label the x axis with the plotted metric instead of the 'x axis' placeholder
# that previously ended up in the saved figure.
ax.set_xlabel('Noisiness')
plt.savefig('/kaggle/working/Test Noisiness.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted discontinuity score for the test split.
ax = df_qta['dis_pred'].plot(kind='hist', bins=100, title='Distribution of Discontinuity in Test Set')
# Label the x axis with the plotted metric instead of the 'x axis' placeholder
# that previously ended up in the saved figure.
ax.set_xlabel('Discontinuity')
plt.savefig('/kaggle/working/Test Disc.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted coloration score for the test split.
ax = df_qta['col_pred'].plot(kind='hist', bins=100, title='Distribution of Coloration in Test Set')
# Label the x axis with the plotted metric instead of the 'x axis' placeholder
# that previously ended up in the saved figure.
ax.set_xlabel('Coloration')
plt.savefig('/kaggle/working/Test Col.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Histogram of the predicted loudness score for the test split.
ax = df_qta['loud_pred'].plot(kind='hist', bins=100, title='Distribution of Loudness in Test Set')
# Label the x axis with the plotted metric instead of the 'x axis' placeholder
# that previously ended up in the saved figure.
ax.set_xlabel('Loudness')
plt.savefig('/kaggle/working/Test Loud.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Preview the first rows of the test NISQA frame (sorted by mos_pred when it was loaded).
df_qta.head()

# *WER and CER*

In [None]:
# The district name is the run of letters that follows "test_" in the clip id.
df_qta['district'] = df_qta['id'].str.extract(r'test_([a-zA-Z]+)')

# Per-district totals of every NISQA score.
grouped_df_qta = (
    df_qta.groupby('district')[['mos_pred', 'noi_pred', 'dis_pred', 'col_pred', 'loud_pred']]
    .sum()
    .reset_index()
    .rename(columns={
        'mos_pred': 'total_mos',
        'noi_pred': 'total_noi',
        'dis_pred': 'total_dis',
        'col_pred': 'total_col',
        'loud_pred': 'total_loud',
    })
)

# Number of clips per district, joined onto the totals.
entry_count = df_qta.groupby('district').size().reset_index(name='entry_count')
grouped_df_qta = grouped_df_qta.merge(entry_count, on='district')

# Average = total / number of clips, added in the order mos, noi, dis, col, loud.
for metric in ('mos', 'noi', 'dis', 'col', 'loud'):
    grouped_df_qta[f'avg_{metric}'] = grouped_df_qta[f'total_{metric}'] / grouped_df_qta['entry_count']

grouped_df_qta.head()

In [None]:
# Join the district error rates with the district NISQA averages
# (inner join on the district name, which is spelled differently in each frame).
merged_df = df_err.merge(grouped_df_qta, left_on='Districts', right_on='district')

# Pairwise correlations between the averaged quality scores and the error rates.
columns_to_correlate = ['avg_mos', 'avg_noi', 'avg_dis', 'avg_col', 'avg_loud', 'WER', 'CER']
correlation_matrix = merged_df[columns_to_correlate].corr()

# Keep only the WER/CER rows against the five quality-score columns.
correlation_with_wer_cer = correlation_matrix.loc[
    ['WER', 'CER'],
    ['avg_mos', 'avg_noi', 'avg_dis', 'avg_col', 'avg_loud'],
]

print(correlation_with_wer_cer)

In [None]:
# Persist the district-level correlation matrix, then display it.
# The display expression must be the LAST statement of the cell: when it came
# before to_csv() it was evaluated and discarded, so nothing was shown.
correlation_matrix.to_csv('Correlation_matrix.csv')
correlation_matrix

In [None]:
# Annotated heatmap of the WER/CER rows against the averaged NISQA scores,
# drawn on a short, wide canvas because the matrix has only two rows.
fig, ax = plt.subplots(figsize=(5, 1))
sns.heatmap(correlation_with_wer_cer, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)

plt.savefig('/kaggle/working/Correlation_Heatmap_wer_cer.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Inference output loaded from original_inference.csv, with the file name
# reduced to a bare clip id so it lines up with the other frames.
df_oinf = pd.read_csv("/kaggle/input/test-data-inference/original_inference.csv", sep=",").rename(
    columns={'file_name': 'id'}  # rename to match the other frames
)
df_oinf['id'] = (
    df_oinf['id']
    .apply(lambda name: name.split('.')[0])     # keep the text before the first '.', dropping ".wav"
    .str.replace('16kHz_test_audio/', '')       # remove the directory prefix
)
df_oinf.head(10)

In [None]:
# Inference output loaded from trained_inference.csv, with the file name
# reduced to a bare clip id so it lines up with the other frames.
df_tinf = pd.read_csv("/kaggle/input/test-data-inference/trained_inference.csv", sep=",").rename(
    columns={'file_name': 'id'}  # rename to match the other frames
)
df_tinf['id'] = (
    df_tinf['id']
    .apply(lambda name: name.split('.')[0])     # keep the text before the first '.', dropping ".wav"
    .str.replace('16kHz_test_audio/', '')       # remove the directory prefix
)
df_tinf.head(10)

In [None]:
def _score_rows(metric):
    """Apply a jiwer metric to the (transcripts, sentence) pair of every df_tinf row.

    `transcripts` is passed as the first argument and `sentence` as the second.
    NOTE(review): jiwer takes the reference first, so `transcripts` is treated
    as the ground truth here — confirm against the inference CSV.
    """
    return df_tinf.apply(lambda row: metric(row['transcripts'], row['sentence']), axis=1)


# Per-clip word and character error rates.
df_tinf['wer'] = _score_rows(wer)
df_tinf['cer'] = _score_rows(cer)

# Preview the frame with the new error columns.
df_tinf.head(100)

In [None]:
# Clips ordered from the lowest to the highest word error rate.
sorted_df = df_tinf.sort_values('wer')
sorted_df

In [None]:
# Preview the test NISQA frame before it is joined with the per-clip error rates.
df_qta.head()

In [None]:
# Join per-clip error rates with per-clip NISQA scores on the clip id.
# Note: this rebinds merged_df, which previously held the district-level join.
merged_df = df_tinf.merge(df_qta, on='id')

# The district name is the run of letters that follows "test_" in the clip id.
merged_df['district'] = merged_df['id'].str.extract(r'test_([a-zA-Z]+)')
merged_df.head()

In [None]:
# List the distinct district names extracted from the merged clip ids.
merged_df['district'].unique()

In [None]:
# For every district, correlate each NISQA score with the per-clip WER and CER.
# One record per (district, metric) pair is collected for each error type.
Districts = ['chittagong','kishoreganj','narsingdi','narail', 'rangpur','tangail', 'habiganj', 'barishal', 'sylhet']
metrics = ['mos_pred','noi_pred','dis_pred','col_pred','loud_pred']
wer_results = []
cer_results = []

for district in Districts:
    # Clips belonging to the current district only.
    filtered_df = merged_df[merged_df['district'] == district]

    for metric in metrics:
        corr_wer = filtered_df[metric].corr(filtered_df['wer'])
        corr_cer = filtered_df[metric].corr(filtered_df['cer'])

        wer_results.append({
            'WER(Districts)': district,
            'NISQA metrics': metric,
            'correlation with WER': corr_wer,
        })
        cer_results.append({
            'CER(Districts)': district,
            'NISQA metrics': metric,
            'correlation with CER': corr_cer,
        })

# Build both result frames in one go from the collected records.
wer_results_df = pd.DataFrame(wer_results)
cer_results_df = pd.DataFrame(cer_results)
cer_results_df.head(20)

In [None]:
# Reshape the long WER results into a district x NISQA-metric table.
# Note: this rebinds correlation_matrix; the heatmap cell that follows reads it.
correlation_matrix = wer_results_df.pivot(
    index='WER(Districts)',
    columns='NISQA metrics',
    values='correlation with WER',
)

correlation_matrix.to_csv('Correlation_matrix_wer_NISQA.csv')
correlation_matrix

In [None]:
# Annotated heatmap of whatever correlation_matrix currently holds
# (the WER table when the cells are run in order).
fig, ax = plt.subplots(figsize=(6, 4))
heatmap = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm_r', fmt='.2f', ax=ax)

# The pivot's index/column names would otherwise appear as axis labels.
heatmap.set(xlabel='', ylabel='')

plt.savefig('/kaggle/working/Correlation_Heatmap_wer_NISQA.pdf', bbox_inches='tight', dpi=100)
plt.show()

In [None]:
# Reshape the long CER results into a district x NISQA-metric table.
# Note: this rebinds correlation_matrix; the heatmap cell that follows reads it.
correlation_matrix = cer_results_df.pivot(
    index='CER(Districts)',
    columns='NISQA metrics',
    values='correlation with CER',
)

correlation_matrix.to_csv('Correlation_matrix_cer_NISQA.csv')
correlation_matrix

In [None]:
# Annotated heatmap of whatever correlation_matrix currently holds
# (the CER table when the cells are run in order).
fig, ax = plt.subplots(figsize=(6, 4))
heatmap = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm_r', fmt='.2f', ax=ax)

# The pivot's index/column names would otherwise appear as axis labels.
heatmap.set(xlabel='', ylabel='')

plt.savefig('/kaggle/working/Correlation_Heatmap_cer_NISQA.pdf', bbox_inches='tight', dpi=100)
plt.show()