# Results from the Prototype Day in Thisted in August

## Imports

In [None]:
import sys; sys.path.insert(0, '..')
import pandas as pd
import plotly.express as px
import re
from jiwer import wer, mer, wil
from utils import get_project_root
import os

# Show every column when a DataFrame is displayed (no column limit)
pd.options.display.max_columns = None

## Data

In [None]:
# Semicolon-separated export holding the reference transcription and the Google output
path = os.path.join(get_project_root(), 'notebooks', 'prototype_testing_p2_jan.csv')

# Source column -> short name used in the rest of the notebook (order is preserved)
column_names = {
    'Filenavn2': 'Filnavn',
    'Transkribering3': 'Transkribering',
    'Google2': 'Google',
}

df = pd.read_csv(path, sep=';')
df = df[list(column_names)].rename(columns=column_names)
df['Filnavn'] = df['Filnavn'].str.lower()

df.head(3)

In [None]:
# Strip commas, full stops and question marks from both text columns, then lowercase them
punctuation = re.compile(r'[\,\.\?]+')
for text_column in ('Transkribering', 'Google'):
    df[text_column] = [punctuation.sub('', text) for text in df[text_column]]
    df[text_column] = df[text_column].str.lower()

# Computer ID: the first five characters of the file name
df['id'] = df['Filnavn'].astype(str).str.slice(0, 5)
df.head(3)

## Word Error Rate

In [None]:
def add_columns(df, true_text, generated_text, speech_provider:str):
    """Add per-row similarity and error-rate columns comparing two text columns.

    The frame is modified in place and also returned. Four columns are added,
    each suffixed with ``speech_provider``:

    * ``sequence_matcher_<provider>``: ``difflib.SequenceMatcher`` ratio
      between the generated and the true text.
    * ``word_error_rate_<provider>``: ``jiwer.wer``.
    * ``match_error_rate_<provider>``: ``jiwer.mer``.
    * ``word_information_lost_<provider>``: ``jiwer.wil``.

    Args:
        df: DataFrame holding both text columns.
        true_text: Name of the column with the reference text.
        generated_text: Name of the column with the speech-to-text output.
        speech_provider: Suffix used for the new column names.

    Returns:
        The same DataFrame with the four columns added.
    """
    # Imported here because the notebook's import cell does not bring
    # SequenceMatcher into scope; without it the first call raises NameError.
    from difflib import SequenceMatcher

    df[f"sequence_matcher_{speech_provider}"] = df.apply(
        lambda row: SequenceMatcher(None, row[generated_text], row[true_text]).ratio(),
        axis=1,
    )

    # jiwer metrics all take (reference, hypothesis) in that order
    jiwer_metrics = {
        "word_error_rate": wer,
        "match_error_rate": mer,
        "word_information_lost": wil,
    }
    for column_prefix, metric in jiwer_metrics.items():
        df[f"{column_prefix}_{speech_provider}"] = df.apply(
            # metric is bound as a default so each lambda keeps its own function
            lambda row, metric=metric: metric(row[true_text], row[generated_text]),
            axis=1,
        )

    return df

In [None]:
# Score the Google output against the reference transcription
df = add_columns(df, true_text='Transkribering', generated_text='Google', speech_provider='google')

mean_wer_google = df['word_error_rate_google'].mean()
print('Word Error Rate, Average:')
print('Google:', mean_wer_google)

In [None]:
# Mean word error rate per participant id
df.groupby('id')[['word_error_rate_google']].mean()

In [None]:
def plot_WER(df, speech_provider):
    """Show a scatter plot of the word error rate per ``id`` for one provider.

    Points are coloured by ``id`` and a box plot is drawn in the y-margin.

    Args:
        df: DataFrame with the columns ``Filnavn``, ``id`` and
            ``word_error_rate_<speech_provider>``.
        speech_provider: Suffix of the word-error-rate column to plot; also
            used in the axis label.
    """
    wer_column = f"word_error_rate_{speech_provider}"
    visualization_df = df[["Filnavn", wer_column, 'id']]
    fig = px.scatter(
        visualization_df, x='id', y=wer_column, color='id',
        template='plotly_dark', marginal_y='box',
        labels={
            # Key follows the provider so the label applies to any provider,
            # not only to 'google'
            wer_column: f"Word Error Rate for {speech_provider}",
            # NOTE(review): no column called "name" is plotted here, so this
            # entry looks unused - confirm and remove
            "name": "id",
        },
    )
    fig.update_traces(marker=dict(size=12, opacity=0.5))
    fig.update_layout(legend=dict(
        font=dict(size=12, color='white'),
        bordercolor="White",
        borderwidth=2,
    ))
    fig.show()

In [None]:
# Scatter plot of the Google word error rate per id
plot_WER(df, speech_provider='google')

In [None]:
# Box plot of the Google word error rate per id, with every row drawn as a point
wer_box_data = df[["Filnavn", "word_error_rate_google", "id"]]
fig = px.box(
    wer_box_data,
    x='id',
    y='word_error_rate_google',
    color='id',
    points='all',
    template='plotly_dark',
    labels={"word_error_rate_google": "Google Word Error Rate"},
)
fig.update_layout(showlegend=False)
fig.show()

# Exploration of missing, matching and perceived words

In [None]:
def speech_to_text_analysis(df, speech_provider):
    """Add word-level match / missing / perceived statistics for one provider.

    The reference text is read from the ``Transkribering`` column and the
    speech-to-text output from the column named ``speech_provider``. Both are
    split on whitespace and compared word by word. The frame is modified in
    place and also returned.

    Columns added (each suffixed with ``speech_provider``):

    * ``matches_``: reference words that also occur in the provider output,
      joined with spaces (duplicates in the reference are kept).
    * ``count_matches_``: number of words in ``matches_``.
    * ``percent_match_``: ``count_matches_`` divided by the number of
      reference words, rounded to two decimals.
    * ``missing_words_``: list of reference words absent from the provider
      output.
    * ``perceived_words_``: list of provider words absent from the reference.
    * ``missing_words_count_``: length of ``missing_words_``.
    * ``missing_%_``: ``missing_words_count_`` divided by the number of
      reference words, rounded to two decimals.

    Args:
        df: DataFrame with a ``Transkribering`` column and a column named
            after ``speech_provider``, both holding strings.
        speech_provider: Name of the provider column; also the suffix of the
            new columns.

    Returns:
        The same DataFrame with the columns above added.
    """
    reference_column = 'Transkribering'

    reference_words = df[reference_column].str.split()
    provider_words = df[speech_provider].str.split()
    reference_word_count = reference_words.str.len()

    matches, missing, perceived = [], [], []
    for ref_tokens, hyp_tokens in zip(reference_words, provider_words):
        # Whole-word membership: a plain `word in text` test would treat
        # substrings as hits (e.g. "i" found inside "idag")
        ref_vocab, hyp_vocab = set(ref_tokens), set(hyp_tokens)
        matches.append(' '.join(word for word in ref_tokens if word in hyp_vocab))
        missing.append([word for word in ref_tokens if word not in hyp_vocab])
        perceived.append([word for word in hyp_tokens if word not in ref_vocab])

    df[f'matches_{speech_provider}'] = matches
    df[f'count_matches_{speech_provider}'] = df[f'matches_{speech_provider}'].str.split().str.len()
    df[f'percent_match_{speech_provider}'] = (df[f'count_matches_{speech_provider}'] / reference_word_count).round(2)

    # Wrapped in a Series so each list is stored as one cell value.
    # Computed for every provider, addressed by column name rather than position.
    df[f'missing_words_{speech_provider}'] = pd.Series(missing, index=df.index, dtype=object)
    df[f'perceived_words_{speech_provider}'] = pd.Series(perceived, index=df.index, dtype=object)

    # Whole-column assignment instead of chained `df[col][i] = ...`
    df[f'missing_words_count_{speech_provider}'] = [len(words) for words in missing]
    df[f'missing_%_{speech_provider}'] = (df[f'missing_words_count_{speech_provider}'] / reference_word_count).round(2)

    return df

In [None]:
# Add the match / missing / perceived word columns for the 'Google' column
df = speech_to_text_analysis(df, speech_provider='Google')

## Matches

In [None]:
# Count how often each word occurs among the Google matches
maw_g = (
    pd.DataFrame(
        df['matches_Google'].str.split(expand=True).stack().value_counts()
    )
    .reset_index()
    .rename(columns={'index': 'words', 0: 'count'})
)
maw_g['type'] = 'Google'

In [None]:
# Average share of reference words that Google matched
mean_match_google = df['percent_match_Google'].mean()
print('Matching words:')
print('Google:', mean_match_google)

In [None]:
# Bar chart of the matched-word counts
fig = px.bar(
    maw_g,
    x='words',
    y='count',
    color='type',
    barmode='group',
    template='plotly_dark',
    title='Matching words',
)
fig.show()

## Missing words

In [None]:
# Average share of reference words missing from the Google output
mean_missing_google = df['missing_%_Google'].mean()
print('Missing words:')
print('Google:', mean_missing_google)

In [None]:
# One bar per row showing the missing-word share, coloured by id
missing_share = df['missing_%_Google']
fig = px.bar(
    df,
    y='missing_%_Google',
    color='id',
    text=missing_share,
    template='plotly_dark',
    title='Missing words (Google)',
)
# Print each value above its bar
fig.update_traces(textposition='outside')
fig.show()

In [None]:
# One bar per row showing the matched-word share, coloured by id
match_share = df['percent_match_Google']
print('mean score:', match_share.mean())
fig = px.bar(
    df,
    y='percent_match_Google',
    color='id',
    text=match_share,
    template='plotly_dark',
    title=' % matches with the transcription (Google)',
)
# Print each value above its bar
fig.update_traces(textposition='outside')
fig.show()