# clean_dataset_handler

---
- For fine-tuning, sample files with odd pauses or very fast speech should be excluded to avoid issues.
- Whisper also sometimes mistranscribes, giving either no transcription or too much text for a given audio file.
- This handler takes the metadata file and the set of wavs, then performs basic cleanup, including outlier detection on audio length relative to transcription length.
- This handler will output a cleaned_metadata.csv and a stats_metadata.csv. The stats metadata may be useful for trending typical variations in the data; it is not necessary, but was added for future use cases.
---


In [33]:
from pydub import AudioSegment
import webvtt
import os
from scipy.stats import truncnorm, zscore
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [34]:
# Locations of the dataset root, its wav folder, and its metadata file.
# The directory strings keep their trailing slash so file names can be appended directly.
datasetDir = "datasets/personal_voice/"
audioDir = datasetDir + "wavs/"
metadataFile = datasetDir + "metadata.csv"

In [35]:
# Load the pipe-delimited, header-less metadata file into a DataFrame so it can be analysed.
# The three columns are named explicitly because the file itself carries no header row.
colNames = ["fileName", "text", "normalizedTest"]

df = pd.read_csv(
    metadataFile,
    sep="|",
    header=None,
    names=colNames,
)
df.head()

Unnamed: 0,fileName,text,normalizedTest
0,chunk_0000,The first time our father brought Andrea to t...,The first time our father brought Andrea to t...
1,chunk_0001,to my sister's room and told us to come downs...,to my sister's room and told us to come downs...
2,chunk_0002,Is it a work friend? Maeve asked. She was old...,Is it a work friend? Maeve asked. She was old...
3,chunk_0003,"question. I'd say not. Where's your brother?""...","question. I'd say not. Where's your brother?""..."
4,chunk_0004,Sandy had to pull the draperies back to find ...,Sandy had to pull the draperies back to find ...


In [36]:
# Record the character count of every transcription; it is used later to detect text issues
df['textCharLength'] = df['text'].str.len()

# XTTS cannot process text longer than 250 characters, so keep only rows within that limit
MAX_CHAR = 250
df = df.loc[df['textCharLength'].le(MAX_CHAR)]

# Summary statistics of the remaining character lengths
df.describe()

Unnamed: 0,textCharLength
count,2736.0
mean,103.205044
std,24.385283
min,10.0
25%,87.0
50%,102.0
75%,119.0
max,204.0


In [37]:
# Measure the length in ms of every wav file in the audio directory.

# Only keep .wav entries: os.listdir also returns hidden files, sub-directories and
# other stray entries that AudioSegment.from_wav cannot decode. Sorting makes the
# processing order deterministic.
fileList = sorted(
    entry for entry in os.listdir(audioDir)
    if entry.lower().endswith('.wav')
)

# Pair up each audio file with its length in ms
fileSizeSet = []

for file in fileList:
    # Strip only the final extension so names containing extra dots stay intact
    # and still match the fileName column of the metadata
    fileName = os.path.splitext(file)[0]
    # len() of a pydub AudioSegment gives its length in ms
    audioFilePath = os.path.join(audioDir, file)
    audioInMs = len(AudioSegment.from_wav(audioFilePath))
    # Add entry to main set
    fileSizeSet.append([fileName, audioInMs])

In [38]:
# Turn the [fileName, lengthMs] pairs collected from the wav files into a DataFrame
pairingCols = ['fileName', 'lengthMs']

pairingDf = pd.DataFrame(data=fileSizeSet, columns=pairingCols)

# Hard lower/upper length limits could be applied here, but the audio chunking
# step should already have enforced them.

# Summary statistics of the audio lengths
pairingDf.describe()

Unnamed: 0,lengthMs
count,2737.0
mean,6554.928754
std,976.210324
min,3586.0
25%,5908.0
50%,6609.0
75%,7172.0
max,9674.0


In [39]:
# Attach each clip's lengthMs to its metadata row, matching on fileName.
# A left join keeps every metadata row, including any without a matching wav entry.

statsDf = df.merge(pairingDf, how='left', on='fileName')
statsDf.head()

Unnamed: 0,fileName,text,normalizedTest,textCharLength,lengthMs
0,chunk_0000,The first time our father brought Andrea to t...,The first time our father brought Andrea to t...,85,6757
1,chunk_0001,to my sister's room and told us to come downs...,to my sister's room and told us to come downs...,110,7286
2,chunk_0002,Is it a work friend? Maeve asked. She was old...,Is it a work friend? Maeve asked. She was old...,134,8030
3,chunk_0003,"question. I'd say not. Where's your brother?""...","question. I'd say not. Where's your brother?""...",71,5945
4,chunk_0004,Sandy had to pull the draperies back to find ...,Sandy had to pull the draperies back to find ...,86,6145


In [40]:
# Whisper is not perfect at transcription, so some rows have a text length that does
# not fit their audio length; those bad files would harm the dataset.
# To expose them, compute the audio length in ms per transcribed character,
# which is the quantity the outlier detection works on.

statsDf['msPerChar'] = statsDf['lengthMs'].div(statsDf['textCharLength'])

statsDf.describe()

Unnamed: 0,textCharLength,lengthMs,msPerChar
count,2736.0,2736.0,2736.0
mean,103.205044,6554.908991,66.434974
std,24.385283,976.388226,22.371903
min,10.0,3586.0,33.102941
25%,87.0,5908.0,56.71794
50%,102.0,6609.0,63.745417
75%,119.0,7172.0,71.770521
max,204.0,9674.0,630.1


In [41]:
# msPerChar is the speaking rate of each clip in milliseconds per character.
# For the example dataset it ranges from ~33 to ~630 ms per char with a mean of ~66.
# Outliers are detected with a simple z-score test on that rate.
#
# Infinite rates (zero-length text) are treated as missing, and nan_policy='omit'
# keeps missing rates (e.g. a metadata row with no matching wav file) from turning
# every z-score into NaN, which would silently disable the outlier filter.
ratePerChar = statsDf['msPerChar'].replace([np.inf, -np.inf], np.nan)
statsDf['zScore'] = zscore(ratePerChar.to_numpy(dtype=float), nan_policy='omit')

# Absolute z-score limit beyond which a row counts as an outlier. Using 2 for now.
Z_SCORE_MAX = 2

# A row whose rate could not be computed cannot be validated, so it is flagged too
statsDf['outlier'] = ratePerChar.isna() | (statsDf['zScore'].abs() > Z_SCORE_MAX)
numOutliers = statsDf['outlier'].sum()
percentOutliers = (numOutliers/len(statsDf))*100
print(f"Percent outliers detect: {percentOutliers}%")


# Save the full stats (outliers included) as a csv file. Can be used for trending later
statsFile = datasetDir + "stats_metadata.csv"
statsDf.to_csv(statsFile, index=False)

# Filter out outliers
statsDf = statsDf[~statsDf['outlier']]

# Check data stats again. This should reduce the variance in msPerChar
# without affecting the lengthMs too much
statsDf.describe()

Percent outliers detect: 1.0964912280701753%


Unnamed: 0,textCharLength,lengthMs,msPerChar,zScore
count,2706.0,2706.0,2706.0,2706.0
mean,103.883222,6558.471545,65.061653,-0.061397
std,23.590821,975.280787,11.592614,0.518272
min,37.0,3586.0,33.102941,-1.490178
25%,87.0,5908.0,56.563311,-0.441333
50%,102.0,6610.0,63.625824,-0.125589
75%,119.0,7173.0,71.486733,0.225849
max,204.0,9674.0,110.808511,1.983812


In [42]:
# Build the cleaned metadata from the rows that survived the outlier filter.
# statsDf already carries the original metadata columns, so they are selected directly.
# Merging df back onto statsDf would multiply rows whenever a fileName appears more
# than once, and would force '_x'-suffixed column names.
adjustedColNames = ['fileName', 'text', 'normalizedTest']
outputDf = statsDf[adjustedColNames]

# Write to csv in the original XTTS metadata format (pipe-delimited, no header, no index)
cleanedMetadataFile = datasetDir + "cleaned_metadata.csv"
outputDf.to_csv(cleanedMetadataFile, sep="|", index=False, header=False)
print("cleaned dataset created...")