In [2]:
import jiwer
import pandas as pd
import altair as alt
import os
import re
import sys

In [3]:
test = pd.read_csv("iconect/models/pt_only_all_utt_wgt_transfer/data/val.csv")
test.duration.describe()

count    628.000000
mean       7.038217
std       11.973328
min        1.000000
25%        2.000000
50%        3.000000
75%        8.000000
max      152.000000
Name: duration, dtype: float64

In [4]:
def info_extraction(df):
    wer = []
    ptid = []
    text = []
    asr = []
    duration = []
    for index, row in df.iterrows():
        text.append(row.text)
        asr.append(row.asr)
        wer.append(jiwer.wer(row.text, row.asr))
        ptid.append(row.uttid.split("_")[0] )
        duration.append(len(row.text))
    df["wer"] = wer
    df["ptid"] = ptid
    df["text"] = text
    df["asr"] = asr
    df["duration"] = duration
   

In [7]:
orig_long = pd.read_pickle("./iconect/models/pt_only_all_utt_sliced4/decoded/deepspeech_val_lr7e-06-bias_200121-19:01:32.pkl")
info_extraction(orig_long)
orig_long = orig_long[orig_long.uttid.str.contains('Participant', na=False)]
orig_long.wer.describe()

count    282.000000
mean       0.327052
std        0.188165
min        0.000000
25%        0.166667
50%        0.312500
75%        0.470588
max        1.000000
Name: wer, dtype: float64

In [39]:
orig_long = pd.read_pickle("test/slice4_200117_model_on_sliced4test.csv")
info_extraction(orig_long)
test1 = orig_long# [orig_long.uttid.str.contains('Participant', na=False)]
test1.wer.describe()

count    435.000000
mean       0.233239
std        0.172522
min        0.000000
25%        0.111111
50%        0.200000
75%        0.333333
max        0.823529
Name: wer, dtype: float64

In [42]:
# current limitation is 2 seconds
test3 = pd.read_pickle("test/orig_model_on_testtest.csv")
info_extraction(test3)
# test2 = test2[orig_long.uttid.str.contains('Participant', na=False)]
test3.wer.describe()

count    551.000000
mean       0.371677
std        0.239697
min        0.000000
25%        0.185185
50%        0.333333
75%        0.500000
max        1.000000
Name: wer, dtype: float64

In [40]:
test2 = pd.read_pickle("test/slice4_200117_model_on_testtest.csv")
info_extraction(test2)
# test2 = test2[orig_long.uttid.str.contains('Participant', na=False)]
test2.wer.describe()

count    435.000000
mean       0.233239
std        0.172522
min        0.000000
25%        0.111111
50%        0.200000
75%        0.333333
max        0.823529
Name: wer, dtype: float64

In [10]:
train_df = pd.read_csv("./iconect/models/pt_only_all_utt_sliced4/data/train_short.csv")
print(len(train_df))
len(train_df[train_df.uttid.str.contains('Participant', na=False)])

18158


6636

In [15]:
orig_short = pd.read_pickle("iconect/models/pt_only_all_utt/deepspeech_val_gridsearch_191228-09:16:21-lr4.9999999999999996e-06-bias.pkl")
info_extraction(orig_short)
orig_short.wer.describe()

count    522.000000
mean       0.327523
std        0.205210
min        0.000000
25%        0.184454
50%        0.296296
75%        0.443627
max        1.000000
Name: wer, dtype: float64

In [16]:
tuned_long = pd.read_pickle("iconect/models/pt_only_all_utt_sliced4/decoded/deepspeech_test_long_alpha2.pkl")
info_extraction(tuned_long)
tuned_long.wer.describe()

count    304.000000
mean       0.265638
std        0.190360
min        0.000000
25%        0.129032
50%        0.214286
75%        0.363636
max        1.000000
Name: wer, dtype: float64

In [38]:
sum(pd.read_csv("iconect/models/pt_only_all_utt_sliced4/data/test_short.csv").duration > 3)

434

In [17]:
tuned_short = pd.read_pickle("iconect/models/pt_only_all_utt_sliced4/decoded/deepspeech_test_short_alpha2.pkl")
info_extraction(tuned_short)
tuned_short.wer.describe()

count    435.000000
mean       0.253337
std        0.185379
min        0.000000
25%        0.117647
50%        0.214286
75%        0.357143
max        1.000000
Name: wer, dtype: float64

In [9]:
comparing_short = pd.DataFrame.from_dict({"uttid": orig_short.uttid, "ptid":tuned_short.ptid, "wer_diff": list(orig_short.wer - tuned_short.wer)})
comparing_long = pd.DataFrame.from_dict({"uttid": orig_long.uttid, "ptid":tuned_long.ptid, "wer_diff": list(orig_long.wer - tuned_long.wer)})

In [10]:
def pt_describe(df, colname: str):
    result = []
    pts = list(set(df.ptid))
    for ptid in pts:
        result.append(df[df.ptid == ptid][colname].describe().to_frame().transpose().rename(index={colname: ptid}))
    
    result = pd.concat(result)
    return result

In [11]:
ptid_wer_diff = pt_describe(comparing_long, colname="wer_diff")
alt.Chart(ptid_wer_diff.reset_index()).mark_bar().encode(
    x = "index",
    y = "mean",
    color = "count")

In [12]:
ptid_wer_diff = pt_describe(comparing_short, colname="wer_diff")
alt.Chart(ptid_wer_diff.reset_index()).mark_bar().encode(
    x = "index",
    y = "mean",
    color = "count")

In [11]:
orig_describe = pt_describe(orig, colname="wer")
tuned_describe = pt_describe(tuned, colname="wer")

orig_describe["model"] = "orig"
tuned_describe["model"] = "tuned"

describe = pd.concat([orig_describe, tuned_describe])
plt = alt.Chart(describe.reset_index()).mark_bar().encode(
                     x = "index",
                     y = "mean",
                     color = "count",
                     row = "model")
plt

In [8]:
orig_describe.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,model
C1106,14.0,0.531402,0.227567,0.181818,0.413961,0.563492,0.682292,0.857143,orig
PCON1008,3.0,0.521212,0.422638,0.2,0.281818,0.363636,0.681818,1.0,orig
P2CON1008,7.0,0.378755,0.164264,0.177778,0.28125,0.384615,0.419444,0.6875,orig
P2CON2001,4.0,0.383175,0.165312,0.173913,0.293478,0.406667,0.496364,0.545455,orig
PCON1019,52.0,0.316655,0.180056,0.058824,0.186127,0.285714,0.382488,0.909091,orig


In [27]:
tuned_long.loc[4].text

'we had some big jellyfish in victoria you know this size'

In [20]:
tuned.wer.describe()

count    522.000000
mean       0.315936
std        0.201840
min        0.000000
25%        0.176471
50%        0.280625
75%        0.428571
max        1.000000
Name: wer, dtype: float64

In [11]:
orig_describe["mean"].mean()

0.39913614963819727

In [12]:
tuned_describe["mean"].mean()

0.3412366870633728

## Audio situation

In [23]:
from collections import defaultdict
filenames=[]
for pwd, _, fs in os.walk("iconect/csvs/"):
    filenames.extend(fs)
    break

    
# extract pts with more than one transcripts
pt_trans = defaultdict(list)
for i in filenames:
    ptid = i.split("_")[0]
    pt_trans[ptid].append(i)
    
    
df_trans = pd.DataFrame.from_dict({"pdid": list(pt_trans.keys()), "trans_files": list(pt_trans.values())})
df_trans["num_files"] = [len(i) for i in df_trans.trans_files] 


data_pieces = []
for index, val in df_trans.iterrows():
    for filename in val.trans_files:
        temp = pd.read_csv(os.path.join("iconect","csvs", filename))
        data_pieces.append(temp)
        

samples = pd.concat(data_pieces)
samples

Unnamed: 0,uttid,st,et,text,identity,mather_audio_path,audio_prefix,audio_filepath,duration
0,C1106_VC_3_FullCon_Wk01_Day3_100919_Participant_0,0,1,pretty good,Participant,/home/chliu/iconect_data/analysis/data/C1106/W...,C1106_VC_3_FullCon_Wk01_Day3_100919,iconect/seg_audio/C1106_VC_3_FullCon_Wk01_Day3...,1
1,C1106_VC_3_FullCon_Wk01_Day3_100919_Moderator_1,1,4,good you ready to get started today,Moderator,/home/chliu/iconect_data/analysis/data/C1106/W...,C1106_VC_3_FullCon_Wk01_Day3_100919,iconect/seg_audio/C1106_VC_3_FullCon_Wk01_Day3...,3
2,C1106_VC_3_FullCon_Wk01_Day3_100919_Participant_2,4,5,yes,Participant,/home/chliu/iconect_data/analysis/data/C1106/W...,C1106_VC_3_FullCon_Wk01_Day3_100919,iconect/seg_audio/C1106_VC_3_FullCon_Wk01_Day3...,1
3,C1106_VC_3_FullCon_Wk01_Day3_100919_Moderator_3,5,12,okay well my name is debra and i'm part of the...,Moderator,/home/chliu/iconect_data/analysis/data/C1106/W...,C1106_VC_3_FullCon_Wk01_Day3_100919,iconect/seg_audio/C1106_VC_3_FullCon_Wk01_Day3...,7
4,C1106_VC_3_FullCon_Wk01_Day3_100919_Moderator_5,12,25,okay all right we'll go ahead i'll share my sc...,Moderator,/home/chliu/iconect_data/analysis/data/C1106/W...,C1106_VC_3_FullCon_Wk01_Day3_100919,iconect/seg_audio/C1106_VC_3_FullCon_Wk01_Day3...,13
...,...,...,...,...,...,...,...,...,...
251,P2CON2006_VC_3_FullCon_Wk01_Day3_042518_Modera...,1536,1537,mm hmm,Moderator,/home/chliu/iconect_data/analysis/data/P2CON20...,P2CON2006_VC_3_FullCon_Wk01_Day3_042518,iconect/seg_audio/P2CON2006_VC_3_FullCon_Wk01_...,1
252,P2CON2006_VC_3_FullCon_Wk01_Day3_042518_Partic...,1537,1547,and um and uh and and the things that he that ...,Participant,/home/chliu/iconect_data/analysis/data/P2CON20...,P2CON2006_VC_3_FullCon_Wk01_Day3_042518,iconect/seg_audio/P2CON2006_VC_3_FullCon_Wk01_...,10
253,P2CON2006_VC_3_FullCon_Wk01_Day3_042518_Modera...,1547,1549,he felt comfortable there yeah,Moderator,/home/chliu/iconect_data/analysis/data/P2CON20...,P2CON2006_VC_3_FullCon_Wk01_Day3_042518,iconect/seg_audio/P2CON2006_VC_3_FullCon_Wk01_...,2
254,P2CON2006_VC_3_FullCon_Wk01_Day3_042518_Partic...,1549,1550,yes yes yes,Participant,/home/chliu/iconect_data/analysis/data/P2CON20...,P2CON2006_VC_3_FullCon_Wk01_Day3_042518,iconect/seg_audio/P2CON2006_VC_3_FullCon_Wk01_...,1


In [31]:
len(list(set(orig_describe.index)))

53

In [32]:
23/37

0.6216216216216216

In [27]:
# data = []
# for i in ["train.csv", "val.csv", "test.csv"]:
#     data.append(pd.read_csv(os.path.join("./iconect/models/pt_only_all_utt/", i)))
    
audio_duration = 0.0
df = samples[samples.identity == "Participant"]
for index, row in df.iterrows():
    audio_duration += float(row.duration)
        
audio_duration / (60*60)
# total recording 37h
# moderator 14
# participant 23

22.876944444444444