In [60]:
!pip install levenshtein
#https://maxbachmann.github.io/Levenshtein/levenshtein.html#distance
!pip install g2p_en
#https://github.com/Kyubyong/g2p

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [61]:
import os
import pandas as pd
import numpy as np
import Levenshtein as lev
from g2p_en import G2p

In [62]:
class Evaluator:
    '''
    Compare ARPABET transcript files against a fixed reference paragraph.

    For every file in ``actual_path`` the Levenshtein distance between the file
    content and the G2P rendering of ``TARGET_TEXT`` is computed, and the same
    is done for the matching file (if any) in ``pred_path``.
    '''

    # Reference paragraph whose grapheme-to-phoneme rendering is the target.
    TARGET_TEXT = '''Please call Stella. Ask her to bring these things with her from the store: Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob. We also need a small plastic snake and a big toy frog for the kids. She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.'''

    # Columns that process_files() always returns, even when no prediction matched.
    RESULT_COLUMNS = ['file', 'adist', 'adist2', 'pdist', 'pdist2']

    # Cache for get_target_arpabet() so the G2p model is built at most once.
    _target_arpabet = None

    def __init__(self, pred_path, actual_path):
        '''
        pred_path: directory holding the predicted transcripts
        actual_path: directory holding the actual transcripts
        '''
        self.pred_path = pred_path
        self.actual_path = actual_path

    @staticmethod
    def get_target_arpabet():
        '''
        Return ARPABET of the given paragraph as one string.

        The G2p output tokens are concatenated without a separator, so any
        distance computed on the result is character-level.
        NOTE(review): confirm the transcript files use the same layout.
        '''
        if Evaluator._target_arpabet is None:
            ## Grapheme To Phoneme Conversion
            g2p = G2p()
            Evaluator._target_arpabet = ''.join(g2p(Evaluator.TARGET_TEXT))
        return Evaluator._target_arpabet

    @staticmethod
    def calculate_distance(pred_p, target_p, weights=(1, 1, 1), score_cutoff=None):
        '''
        Levenshtein distance is from S1 to S2, here is from pred_p to target_p
        weights for the three operations in the form (insertion, deletion, substitution)
        If the distance is bigger than score_cutoff, score_cutoff + 1 is returned instead
        '''
        return lev.distance(pred_p, target_p, weights=weights, score_cutoff=score_cutoff)

    @staticmethod
    def read_txt(input_file):
        '''Return the whole content of ``input_file`` as text (decoded as UTF-8).'''
        # Explicit encoding so the result does not depend on the platform default.
        with open(input_file, "r", encoding="utf-8") as file:
            return file.read()

    def process_files(self):
        '''
        Score every file of ``actual_path`` and, when present, its prediction.

        Returns a DataFrame with one row per actual file and the columns
        ``file``, ``adist``/``adist2`` (actual vs. target, substitution weight
        1 and 2) and ``pdist``/``pdist2`` (prediction vs. target, same weights).
        ``pdist``/``pdist2`` are NaN for files without a prediction.
        '''
        # The target is the same for every file, so compute it once up front.
        target_p = self.get_target_arpabet()

        # Actual files can start with a capital while pred files may not, so
        # predictions are looked up by lowercase name. An exactly-lowercase
        # file name wins if two names differ only by case.
        pred_lookup = {}
        for pred_name in os.listdir(self.pred_path):
            key = pred_name.lower()
            if key not in pred_lookup or pred_name == key:
                pred_lookup[key] = pred_name

        res = []
        for file_name in os.listdir(self.actual_path):
            actual_file = os.path.join(self.actual_path, file_name)
            # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be read as text.
            if not os.path.isfile(actual_file):
                continue

            print(file_name)
            one_row = {'file': file_name}
            actual_p = self.read_txt(actual_file)
            dist_a = self.calculate_distance(actual_p, target_p)
            dist2_a = self.calculate_distance(actual_p, target_p, weights=(1, 1, 2))
            one_row.update({'adist': dist_a, 'adist2': dist2_a})

            pred_name = pred_lookup.get(file_name.lower())
            if pred_name is not None:
                pred_p = self.read_txt(os.path.join(self.pred_path, pred_name))
                dist = self.calculate_distance(pred_p, target_p)
                dist2 = self.calculate_distance(pred_p, target_p, weights=(1, 1, 2))
                print(f'Processed file {file_name}, distance {dist} and distance 2 is {dist2}')
                one_row.update({'pdist': dist, 'pdist2': dist2})

            res.append(one_row)

        # Fixing the columns keeps 'pdist'/'pdist2' present (as NaN) even when
        # no prediction file matched or the directory was empty.
        return pd.DataFrame(res, columns=self.RESULT_COLUMNS)

    @staticmethod
    def save(df, file_path='res.csv'):
        '''Write ``df`` to ``file_path`` as CSV without the index column.'''
        df.to_csv(file_path, index=False)


In [63]:
# Input directories, given relative to the notebook's working directory.
# base_path and finetuned_path are each handed to Evaluator as pred_path,
# actual_path as actual_path (see the Evaluator(...) calls further down).
base_path = 'baseline_arpa/'
actual_path = 'arpa_transcripts/'
finetuned_path = 'finetuned_arpa/'

In [64]:
# Number of directory entries in the baseline, actual and finetuned folders,
# printed one per line in that order.
print(
    *(len(os.listdir(folder)) for folder in (base_path, actual_path, finetuned_path)),
    sep='\n',
)

96
100
97


In [65]:
# One evaluator per prediction directory; both use the same actual_path.
base_evator = Evaluator(base_path, actual_path)
finetuned_evator = Evaluator(finetuned_path, actual_path)

In [66]:
# Per-file distance table for the files in base_path (prints each file as it is processed).
df_base = base_evator.process_files()

bulgarian1.txt
Processed file bulgarian1.txt, distance 24 and distance 2 is 30
mankanya1.txt
Processed file mankanya1.txt, distance 103 and distance 2 is 129
Mandarin1.txt
Processed file Mandarin1.txt, distance 60 and distance 2 is 86
Spanish3.txt
Processed file Spanish3.txt, distance 83 and distance 2 is 110
Portuguese3.txt
Processed file Portuguese3.txt, distance 75 and distance 2 is 100
Kirghiz1.txt
Processed file Kirghiz1.txt, distance 69 and distance 2 is 94
dari6.txt
Processed file dari6.txt, distance 37 and distance 2 is 48
danish2.txt
Processed file danish2.txt, distance 20 and distance 2 is 23
arabic17.txt
Processed file arabic17.txt, distance 43 and distance 2 is 58
arabic53.txt
Processed file arabic53.txt, distance 53 and distance 2 is 67
English55.txt
Processed file English55.txt, distance 19 and distance 2 is 23
greek2.txt
Processed file greek2.txt, distance 89 and distance 2 is 103
korean20.txt
Processed file korean20.txt, distance 20 and distance 2 is 25
tagalog5.txt
Pro

In [67]:
# Per-file distance table for the files in finetuned_path (prints each file as it is processed).
df_finetuned = finetuned_evator.process_files()

bulgarian1.txt
Processed file bulgarian1.txt, distance 121 and distance 2 is 182
mankanya1.txt
Processed file mankanya1.txt, distance 168 and distance 2 is 228
Mandarin1.txt
Processed file Mandarin1.txt, distance 104 and distance 2 is 165
Spanish3.txt
Processed file Spanish3.txt, distance 167 and distance 2 is 239
Portuguese3.txt
Processed file Portuguese3.txt, distance 173 and distance 2 is 240
Kirghiz1.txt
Processed file Kirghiz1.txt, distance 155 and distance 2 is 229
dari6.txt
Processed file dari6.txt, distance 129 and distance 2 is 195
danish2.txt
Processed file danish2.txt, distance 94 and distance 2 is 142
arabic17.txt
Processed file arabic17.txt, distance 110 and distance 2 is 173
arabic53.txt
Processed file arabic53.txt, distance 120 and distance 2 is 173
English55.txt
Processed file English55.txt, distance 108 and distance 2 is 168
greek2.txt
Processed file greek2.txt, distance 173 and distance 2 is 244
korean20.txt
Processed file korean20.txt, distance 100 and distance 2 is 

In [68]:
##df.to_csv('res.csv', index=False)

In [69]:
# Gap between the prediction-to-target and actual-to-target distances.
# 'dist' uses the (1, 1, 1) weights, 'dist2' the (1, 1, 2) weights; a negative
# value means the predicted transcript is closer to the target than the actual one.
df_base['dist'] = df_base['pdist'].sub(df_base['adist'])
df_base['dist2'] = df_base['pdist2'].sub(df_base['adist2'])

df_finetuned['dist'] = df_finetuned['pdist'].sub(df_finetuned['adist'])
df_finetuned['dist2'] = df_finetuned['pdist2'].sub(df_finetuned['adist2'])

In [70]:
# Summary statistics of the distance columns for the baseline table.
df_base.describe()

Unnamed: 0,adist,adist2,pdist,pdist2,dist,dist2
count,100.0,100.0,96.0,96.0,96.0,96.0
mean,139.24,208.7,57.5,75.760417,-78.53125,-128.802083
std,31.936528,39.489034,32.888328,45.055462,24.453588,36.36355
min,99.0,152.0,14.0,14.0,-119.0,-189.0
25%,119.75,184.75,31.0,39.5,-95.0,-153.25
50%,130.0,200.5,52.0,67.0,-83.0,-136.5
75%,151.75,229.0,71.25,95.25,-68.5,-111.0
max,310.0,422.0,175.0,232.0,31.0,20.0


In [71]:
# Summary statistics of the distance columns for the finetuned table.
df_finetuned.describe()

Unnamed: 0,adist,adist2,pdist,pdist2,dist,dist2
count,100.0,100.0,97.0,97.0,97.0,97.0
mean,139.24,208.7,128.051546,191.020619,-8.360825,-14.113402
std,31.936528,39.489034,24.7796,29.471024,12.063362,18.094587
min,99.0,152.0,94.0,142.0,-35.0,-57.0
25%,119.75,184.75,111.0,170.0,-16.0,-23.0
50%,130.0,200.5,121.0,187.0,-8.0,-14.0
75%,151.75,229.0,136.0,206.0,0.0,-3.0
max,310.0,422.0,213.0,283.0,20.0,38.0


In [72]:
def get_norm(y, ord=2):
    '''
    Return the vector norm of ``y`` with NaN entries left out.

    y: array-like of numbers (ndarray, list, pandas Series, ...); it is
       converted to a float array first, so plain lists and object-dtype
       input work too.
    ord: norm order, forwarded unchanged to ``np.linalg.norm``.
    '''
    values = np.asarray(y, dtype=float)
    # Boolean-mask indexing drops the NaNs and flattens the input to 1-D.
    return np.linalg.norm(values[~np.isnan(values)], ord=ord)

In [74]:
# L2 norm of the non-NaN baseline 'dist' values (pdist - adist).
get_norm(df_base['dist'].values, ord=2)

805.5153629819856

In [75]:
# L2 norm of the non-NaN finetuned 'dist' values (pdist - adist).
get_norm(df_finetuned['dist'].values, ord=2)

144.05207391773297

In [76]:
# L2 norm of the non-NaN baseline 'dist2' values (pdist2 - adist2).
get_norm(df_base['dist2'].values, ord=2)

1310.8230239052104

In [77]:
# L2 norm of the non-NaN finetuned 'dist2' values (pdist2 - adist2).
get_norm(df_finetuned['dist2'].values, ord=2)

225.28426487440262

In [78]:
from plotly.offline import iplot
import plotly.graph_objs as go
import plotly.express as px

In [79]:
# Preview the first rows of the baseline table, including the derived dist/dist2 columns.
df_base.head()

Unnamed: 0,file,adist,adist2,pdist,pdist2,dist,dist2
0,bulgarian1.txt,126,196,24.0,30.0,-102.0,-166.0
1,mankanya1.txt,183,250,103.0,129.0,-80.0,-121.0
2,Mandarin1.txt,127,210,60.0,86.0,-67.0,-124.0
3,Spanish3.txt,169,246,83.0,110.0,-86.0,-136.0
4,Portuguese3.txt,162,243,75.0,100.0,-87.0,-143.0


In [80]:
# Side-by-side table: one row per file present in both frames. Finetuned
# columns keep their names, baseline columns get the '_base' suffix.
df_combined = pd.merge(
    df_finetuned,
    df_base,
    how='inner',
    on='file',
    suffixes=('', '_base'),
)

In [81]:
# Preview the merged finetuned/baseline table.
df_combined.head()

Unnamed: 0,file,adist,adist2,pdist,pdist2,dist,dist2,adist_base,adist2_base,pdist_base,pdist2_base,dist_base,dist2_base
0,bulgarian1.txt,126,196,121.0,182.0,-5.0,-14.0,126,196,24.0,30.0,-102.0,-166.0
1,mankanya1.txt,183,250,168.0,228.0,-15.0,-22.0,183,250,103.0,129.0,-80.0,-121.0
2,Mandarin1.txt,127,210,104.0,165.0,-23.0,-45.0,127,210,60.0,86.0,-67.0,-124.0
3,Spanish3.txt,169,246,167.0,239.0,-2.0,-7.0,169,246,83.0,110.0,-86.0,-136.0
4,Portuguese3.txt,162,243,173.0,240.0,11.0,-3.0,162,243,75.0,100.0,-87.0,-143.0


In [82]:
# Label every row with the run it came from, then stack both tables for plotting.
df_finetuned['tag'] = 'finetuned'
df_base['tag'] = 'base'
# DataFrame.append is deprecated and removed in pandas 2.0. pd.concat is the
# replacement and, like append, keeps each frame's original row index.
df_all = pd.concat([df_finetuned, df_base])


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [83]:
# Preview the stacked table; the 'tag' column tells the two runs apart.
df_all.head()

Unnamed: 0,file,adist,adist2,pdist,pdist2,dist,dist2,tag
0,bulgarian1.txt,126,196,121.0,182.0,-5.0,-14.0,finetuned
1,mankanya1.txt,183,250,168.0,228.0,-15.0,-22.0,finetuned
2,Mandarin1.txt,127,210,104.0,165.0,-23.0,-45.0,finetuned
3,Spanish3.txt,169,246,167.0,239.0,-2.0,-7.0,finetuned
4,Portuguese3.txt,162,243,173.0,240.0,11.0,-3.0,finetuned


In [99]:
# Actual-to-target distance (x) against prediction-to-target distance (y),
# one colour per run.
fig = px.scatter(df_all, x='adist', y='pdist', color='tag')

fig.update_xaxes(title='Actual distance')
fig.update_yaxes(title='Predicted distance')

# NOTE(review): the fixed x window only shows adist values between 90 and 230
# in the initial view - confirm that leaving larger values out is intended.
fig.update_layout(
    xaxis_range=[90, 230],
    legend_title_font_size=20,
    legend_font_size=20,
    xaxis_title_font_size=20,
    yaxis_title_font_size=20,
)

fig.show()

In [104]:
# Actual-to-target distance (x) against the gap pdist - adist (y),
# one colour per run.
fig2 = px.scatter(df_all, x='adist', y='dist', color='tag')

fig2.update_xaxes(title='Actual distance')
fig2.update_yaxes(title='Predicted - Actual')

# NOTE(review): the fixed x window only shows adist values between 90 and 230
# in the initial view - confirm that leaving larger values out is intended.
fig2.update_layout(
    xaxis_range=[90, 230],
    legend_title_font_size=20,
    legend_font_size=20,
    xaxis_title_font_size=20,
    yaxis_title_font_size=20,
)

In [105]:
# Render the 'Predicted - Actual' scatter configured in the previous cell.
fig2.show()