# Imports

In [1]:
import json
from pandas.io.json import json_normalize

import os
import regex as re
import string
import itertools

import numpy as np
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

import jiwer
from jiwer import wer

import pickle

# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
# pd.set_option('display.max_colwidth', -1)

In [2]:
# Show the directory the kernel started in, then switch to the shared project
# folder so that the relative "./data/..." paths used further down resolve.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
starting_dir = os.getcwd()
print(starting_dir)
os.chdir('/Users/traceyetheridge/Documents/Project/Git_Shared')

/Users/traceyetheridge/Documents/Project/Git_Shared/10_cleaning


In [3]:
# own functions, placed in functions.py.
from functions import *

# Import dataframe with original data

In [4]:
# Load the raw results frame and keep an untouched copy of it; the copy is
# used later for the before/after-cleaning comparison.
# NOTE: unpickling can execute arbitrary code -- only load pickles that were
# produced by this project.
df = pd.read_pickle("./data/df_raw_new.pkl")
df_original = df.copy()

# Shape goes last: a bare expression is only displayed when it is the final
# statement of the cell (previously it sat mid-cell and was discarded).
df.shape

# How many unique reference texts are there?

In [5]:
# Distinct values of the 'identifier' column; the count is the cell output.
unique = pd.unique(df['identifier'])
len(unique)

31743

# Clean reference texts and hypothesis texts
 - initial volume: 665,328
 - after basic cleaning of ref and hyp: needs update (402.992)
 - after dropping Switchboard: needs update (298.896)

In [6]:
import regex as re

#------------------------------------
#       BASIC CLEANING
#------------------------------------

# Lowercase both reference and hypothesis.
df['reference.text'] = df['reference.text'].str.lower()
df['hypothesis.text'] = df['hypothesis.text'].str.lower()

# Collapse spelled-out tokens in the reference: t_v_s -> tvs, t_v_ -> tv, i_d_ -> id.
# These are literal replacements (regex=False). The longer 't_v_s' goes first so
# it is handled explicitly instead of relying on the 't_v_' replacement.
for spelled_out, compact in (('t_v_s', 'tvs'), ('t_v_', 'tv'), ('i_d_', 'id')):
    df['reference.text'] = df['reference.text'].str.replace(spelled_out, compact, regex=False)

# Remove these characters from the reference: : " { } [ ] $
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, in which case nothing would be removed.
remove_chars_list = [':', '"', '{', '}', '[', ']', '$']
pattern_remove = '|'.join(['({})'.format(re.escape(c)) for c in remove_chars_list])
df['reference.text'] = df['reference.text'].str.replace(pattern_remove, '', regex=True)

# Collapse every whitespace run to a single space and trim both ends.
# Raw string: '\s' inside a normal literal is an invalid escape sequence.
whitespace_run = re.compile(r'\s+')
for text_column in ('reference.text', 'hypothesis.text'):
    df[text_column] = df[text_column].apply(lambda x: whitespace_run.sub(' ', x).strip())

# Remove sentences with < 3 words: after the normalisation above, at least
# 3 words means at least 2 single spaces.
df = df[df['reference.text'].apply(lambda x: x.count(" ") >= 2)]

# Remove sentences with empty hypothesis texts.
df = df[df['hypothesis.text'] != ""]

#------------------------------------
#       DROP CORPUS'
#------------------------------------

# 'switchboard_segmented' is intentionally kept (its drop was disabled).
dropped_corpora = ['tedlium_unsegmented', 'rt_segmented_hsum', 'ami_segmented_mix']
df = df[~df['corpus'].isin(dropped_corpora)]

#------------------------------------
#       DROP NON-LEXICAL SOUND ONLY TEXTS
#------------------------------------

# Element-wise comparison with False: only rows explicitly flagged False are kept.
df = df[df['reference.only_non_lexical_sounds'].eq(False)]

#----------------------------------------------
#       DROP TEXTS WITH MORE THAN 1 SPEAKER
#----------------------------------------------

# A reference text occurring in more rows than this is treated as spoken by
# more than one speaker and dropped.
# NOTE(review): presumably 16 is the number of rows one utterance produces
# (one per configuration) -- confirm against the data.
MAX_ROWS_PER_REFERENCE = 16

ref_texts_counts = (
    df['reference.text']
    .value_counts()
    .rename_axis('reference.text')
    .reset_index(name='counts')
)
unique_refs_1_speaker = ref_texts_counts.loc[
    ref_texts_counts['counts'] <= MAX_ROWS_PER_REFERENCE, 'reference.text'
].unique()
df = df[df['reference.text'].isin(unique_refs_1_speaker)]

#----------------------------------------------
#       KEEP COLUMNS OF INTEREST
#----------------------------------------------

keep_columns = ['identifier', 'speaker_id', 'file', 'corpus', 'configuration', 'machine', 'reference.text', 'hypothesis.text', 'scoring.wer']
# .copy() makes the result an independent frame, so adding or replacing columns
# later does not trigger SettingWithCopyWarning.
df = df[keep_columns].copy()

# Where did we lose data during cleaning?

In [7]:
# Row counts per corpus in the raw frame and in the cleaned frame.
df_before_cleaning = df_original['corpus'].value_counts()
df_after_cleaning = df['corpus'].value_counts()

print('Total rows before cleaning: ', df_original.shape[0])
print('Total rows after cleaning: ', df.shape[0])

# Side-by-side table indexed by corpus. Corpora with no rows left after
# cleaning have NaN in 'after', and therefore NaN in 'lost' as well.
df_comparison = pd.concat(
    [df_before_cleaning, df_after_cleaning],
    axis=1,
    sort=False,
    keys=['before', 'after'],
)

# Fraction of each corpus removed by cleaning, rounded to two decimals.
rows_lost = df_comparison['before'].sub(df_comparison['after'])
df_comparison['lost'] = rows_lost.div(df_comparison['before']).round(2)

df_comparison.sort_values(by='before', ascending=False)

Total rows before cleaning:  665328
Total rows after cleaning:  333459


Unnamed: 0,before,after,lost
rt_segmented_h,85184,49740.0,0.42
rt_segmented_hsum,85184,,
ami_segmented_h,73008,37625.0,0.48
ami_segmented_mix,72256,,
switchboard_segmented,65680,34141.0,0.48
commonvoice,63920,23941.0,0.63
librispeech_other,47024,46686.0,0.01
voxforge,46864,35233.0,0.25
librispeech_clean,41920,41582.0,0.01
st,38752,38266.0,0.01


In [8]:
# Sanity check: count rows whose reference / hypothesis text is the empty
# string. For each column the raw frame is printed first, then the cleaned one.
for text_column in ('reference.text', 'hypothesis.text'):
    for frame in (df_original, df):
        print(len(frame[frame[text_column] == ""]))

21856
0
79488
0


# Order data by mean wer by configuration (over all corpora)

In [25]:
# Rank the configurations by their mean WER computed over all corpora
# (lowest mean WER first). 'count' is the number of rows per configuration.
mean_wer_agg = (
    df.groupby(['configuration'])
      .agg(count=('scoring.wer', 'size'), mean_wer=('scoring.wer', 'mean'))
      .reset_index()
      .sort_values('mean_wer')
)

order = mean_wer_agg['configuration'].values

# Turn 'configuration' into a categorical whose category order is that ranking,
# so sorting by it follows the ranking rather than the alphabet.
# assign() builds a new frame instead of writing a column into what may be a
# filtered slice (avoids SettingWithCopyWarning) and avoids inplace mutation.
# reset_index() keeps the previous row labels as an 'index' column, which is
# part of the exported frame.
df = (
    df.assign(configuration=pd.Categorical(df['configuration'], categories=order))
      .sort_values('configuration')
      .reset_index()
)

# Export clean and enriched data frame

In [26]:
# Write the cleaned, configuration-ordered frame to disk (path is relative to
# the current working directory).
clean_pickle_path = "./data/df_clean_newwer_new.pkl"
df.to_pickle(clean_pickle_path)

In [27]:
# Preview the first five rows of the exported frame.
df.head(n=5)

Unnamed: 0,index,identifier,speaker_id,file,corpus,configuration,machine,reference.text,hypothesis.text,scoring.wer
0,1,sw2061A-ms98-a-0123,1167,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,yeah because see what happens is they have a g...,yeah because see what happens is they have a g...,0.0
1,6,sw4522A-ms98-a-0011,1601,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,they treat them they treat them just utterly h...,they treat them they treat them just utterly h...,0.105263
2,8,sw2335B-ms98-a-0086,1175,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,stuffed them in the freezer and then shot them...,stuff the minute freezing they shot them all,0.6
3,9,sw2243B-ms98-a-0091,1130,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,um it sounds like i'm wise,um it sounds like i'm wise,0.0
4,10,sw2923A-ms98-a-0053,1059,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,i realize there's just a lot of a lot of thing...,i realize there's just a lot of a lot of thing...,0.038462


In [28]:
# Check file: read the export back from the same relative path it was written
# to (the working directory has not changed since the export), rather than from
# a machine-specific absolute path.
# NOTE: unpickling can execute arbitrary code -- only load pickles that were
# produced by this project.
df = pd.read_pickle("./data/df_clean_newwer_new.pkl")
df

Unnamed: 0,index,identifier,speaker_id,file,corpus,configuration,machine,reference.text,hypothesis.text,scoring.wer
0,1,sw2061A-ms98-a-0123,1167,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,yeah because see what happens is they have a g...,yeah because see what happens is they have a g...,0.000000
1,6,sw4522A-ms98-a-0011,1601,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,they treat them they treat them just utterly h...,they treat them they treat them just utterly h...,0.105263
2,8,sw2335B-ms98-a-0086,1175,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,stuffed them in the freezer and then shot them...,stuff the minute freezing they shot them all,0.600000
3,9,sw2243B-ms98-a-0091,1130,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,um it sounds like i'm wise,um it sounds like i'm wise,0.000000
4,10,sw2923A-ms98-a-0053,1059,amazon__8000_8__switchboard_segmented.json,switchboard_segmented,amazon__8000_8,amazon,i realize there's just a lot of a lot of thing...,i realize there's just a lot of a lot of thing...,0.038462
...,...,...,...,...,...,...,...,...,...,...
333454,4049,sw2549B-ms98-a-0070,1211,sphinx4__am_generic_ptm__lm_generic_p__8000_8_...,switchboard_segmented,sphinx4__am_generic_ptm__lm_generic_p__8000_8,sphinx4,yeah i've i've never liked ph philadelphia the...,you get hooked,1.000000
333455,4046,sw3573B-ms98-a-0021,1402,sphinx4__am_generic_ptm__lm_generic_p__8000_8_...,switchboard_segmented,sphinx4__am_generic_ptm__lm_generic_p__8000_8,sphinx4,uh-huh yours is a four door,up,1.000000
333456,4044,sw4367A-ms98-a-0039,1554,sphinx4__am_generic_ptm__lm_generic_p__8000_8_...,switchboard_segmented,sphinx4__am_generic_ptm__lm_generic_p__8000_8,sphinx4,that that i want even though i may not they ma...,up,1.000000
333457,3863,sw3466B-ms98-a-0077,1441,sphinx4__am_generic_ptm__lm_generic_p__8000_8_...,switchboard_segmented,sphinx4__am_generic_ptm__lm_generic_p__8000_8,sphinx4,back in and they were from up north i guess it...,what,1.000000
