# clean_dataset_handler

---
- For fine-tuning, sample files with odd pauses or very fast speech should be excluded to avoid issues.
- Whisper also sometimes mistranscribes, giving either no transcription or too much text for a given audio file.
- This handler takes the metadata file and the set of wav files, then performs basic cleanup, including outlier detection on audio length relative to transcription length.
- This handler will output a cleaned_metadata.csv and a stats_metadata.csv. The stats metadata may be useful for trending typical variations in data; it is not necessary, but is included for future use cases.
---


In [20]:
from pydub import AudioSegment
import webvtt
import os
from scipy.stats import truncnorm, zscore
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [21]:
# Dataset locations. Directory paths keep their trailing slash because
# file paths are built from them by plain string concatenation.
datasetDir = "datasets/normalExample/"
audioDir = datasetDir + "wavs/"
metadataFile = datasetDir + "metadata.csv"

In [22]:
# Load the pipe-delimited, header-less metadata file into a DataFrame for analysis
colNames = ["fileName", "text", "normalizedTest"]

df = pd.read_csv(
    metadataFile,
    names=colNames,
    header=None,
    sep="|",
)
df.head()

Unnamed: 0,fileName,text,normalizedTest
0,chunk_0000,We are what we think. All that we are arises,We are what we think. All that we are arises
1,chunk_0001,"with our thoughts. With our thoughts, we make...","with our thoughts. With our thoughts, we make..."
2,chunk_0002,The Illustrious Buddha. This is my simple rel...,The Illustrious Buddha. This is my simple rel...
3,chunk_0003,"There is no need for temples, no need for com...","There is no need for temples, no need for com..."
4,chunk_0004,is our temple. The philosophy is kindness. Da...,is our temple. The philosophy is kindness. Da...


In [23]:
# Record each transcription's character count; it is used later to spot text issues
df = df.assign(textCharLength=df['text'].str.len())

# XTTS cannot process text longer than 250 characters, so drop anything above that
MAX_CHAR = 250
df = df.loc[df['textCharLength'].le(MAX_CHAR)]

df.describe()

Unnamed: 0,textCharLength
count,1460.0
mean,96.400685
std,21.107165
min,11.0
25%,83.0
50%,96.0
75%,111.0
max,214.0


In [24]:
# Measure the duration (in ms) of every wav file in the audio directory.
# Each entry of fileSizeSet is [fileName, lengthMs]; fileName has its extension
# removed so it can be matched against the metadata's fileName column.

# List everything in the audio directory
fileList = os.listdir(audioDir)

# Pair up each audio file with its length in ms
fileSizeSet = []

for file in fileList:
    # Split off only the final extension so names that contain dots stay intact
    fileName, extension = os.path.splitext(file)
    # Skip entries that are not wav files (hidden files, other formats, ...)
    # instead of letting the wav loader fail on them
    if extension.lower() != '.wav':
        continue
    # len() of a pydub AudioSegment gives its length in ms
    audioFilePath = audioDir + file
    audioInMs = len(AudioSegment.from_wav(audioFilePath))
    # Add entry to main set
    fileSizeSet.append([fileName, audioInMs])

In [25]:
# Build a DataFrame that pairs every audio file name with its duration in ms
pairingCols = ['fileName', 'lengthMs']
pairingDf = pd.DataFrame(data=fileSizeSet, columns=pairingCols)

# No hard minimum/maximum duration filter is applied here; that is expected to
# have been handled already when the audio file was chunked

pairingDf.describe()

Unnamed: 0,lengthMs
count,1460.0
mean,6488.497945
std,1025.71017
min,2207.0
25%,6000.0
50%,7000.0
75%,7000.0
max,10000.0


In [26]:
# Attach the audio duration (lengthMs) to each metadata row by joining on fileName.
# A left join keeps every metadata row.
statsDf = df.merge(pairingDf, how='left', on='fileName')
statsDf.head()

Unnamed: 0,fileName,text,normalizedTest,textCharLength,lengthMs
0,chunk_0000,We are what we think. All that we are arises,We are what we think. All that we are arises,45,6000
1,chunk_0001,"with our thoughts. With our thoughts, we make...","with our thoughts. With our thoughts, we make...",65,6000
2,chunk_0002,The Illustrious Buddha. This is my simple rel...,The Illustrious Buddha. This is my simple rel...,52,6000
3,chunk_0003,"There is no need for temples, no need for com...","There is no need for temples, no need for com...",96,7000
4,chunk_0004,is our temple. The philosophy is kindness. Da...,is our temple. The philosophy is kindness. Da...,54,6000


In [27]:
# Whisper is not perfect at transcription: some clips end up with a text length that
# does not fit their audio length, and such files harm the dataset.
# Milliseconds of audio per transcribed character is the metric used to detect them.

statsDf['msPerChar'] = statsDf['lengthMs'].div(statsDf['textCharLength'])

statsDf.describe()

Unnamed: 0,textCharLength,lengthMs,msPerChar
count,1460.0,1460.0,1460.0
mean,96.400685,6488.497945,69.338129
std,21.107165,1025.71017,15.733938
min,11.0,2207.0,32.71028
25%,83.0,6000.0,60.869565
50%,96.0,7000.0,66.666667
75%,111.0,7000.0,74.074074
max,214.0,10000.0,333.333333


In [28]:
# We now have access to the typical ms per char rate.
# In the example data the rate ranges from ~33 to 333 ms per char for a given
# transcription, with an average of about 69 ms per char.
# Outliers are detected with a simple z-score test on that rate.

# Rows without a usable rate (NaN when no audio length was merged in, inf when the
# text length is 0) are left out of the mean/std. Otherwise a single such row turns
# every z-score into NaN, which silently disables the outlier test below.
ratePerChar = statsDf['msPerChar'].to_numpy(dtype=float)
validRate = np.isfinite(ratePerChar)
statsDf['zScore'] = zscore(np.where(validRate, ratePerChar, np.nan), nan_policy='omit')

# Absolute z-score limit above which a row is treated as an outlier
Z_SCORE_MAX = 2

# Rows with no usable rate cannot be validated, so they are flagged as outliers too
statsDf['outlier'] = (statsDf['zScore'].abs() > Z_SCORE_MAX) | ~validRate
numOutliers = statsDf['outlier'].sum()
percentOutliers = (numOutliers/len(statsDf))*100
print(f"Percent outliers detect: {percentOutliers}%")


# Save the full stats df (outliers included) as a csv file. Can use for trending later
statsFile = datasetDir + "stats_metadata.csv"
statsDf.to_csv(statsFile, index=False)

# Filter out outliers
statsDf = statsDf[~statsDf['outlier']]

# Check data stats again. This should reduce the variance in msPerChar
# without affecting the lengthMs too much
statsDf.describe()

Percent outliers detect: 2.4657534246575343%


Unnamed: 0,textCharLength,lengthMs,msPerChar,zScore
count,1424.0,1424.0,1424.0,1424.0
mean,97.26264,6488.764045,67.926085,-0.089776
std,19.764598,1018.390358,9.580433,0.609111
min,43.0,3000.0,45.112782,-1.540215
25%,84.0,6000.0,60.869565,-0.53842
50%,96.0,7000.0,66.666667,-0.169848
75%,111.0,7000.0,73.684211,0.276318
max,166.0,10000.0,100.0,1.949441


In [32]:
# statsDf now holds only the rows that passed the outlier filter and still carries the
# original metadata columns, so select them directly. Re-merging df with statsDf would
# duplicate every shared column (with _x/_y suffixes) and multiply rows whenever a
# fileName appears more than once.
adjustedColNames = ['fileName', 'text', 'normalizedTest']
outputDf = statsDf[adjustedColNames]

# Write to csv in the original XTTS metadata format (pipe-delimited, no header, no index)
cleanedMetadataFile = datasetDir + "cleaned_metadata.csv"
outputDf.to_csv(cleanedMetadataFile, sep="|", index=False, header=False)