In [None]:
import os
import sys
import math
import numpy as np
import pandas as pd

import librosa
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from collections import Counter

# load other modules --> repo root path
sys.path.insert(0, "../")

from utils import text
from utils import audio
from utils.logging import Logger
from params.params import Params as hp
from dataset.dataset import TextToSpeechDataset, TextToSpeechDatasetCollection, TextToSpeechCollate

### Load dataset and prepare data

In [None]:
# Sample rate stored on the shared parameter object (presumably Hz — confirm against utils.audio).
hp.sample_rate = 22050
# Language names; later cells translate item['language'] into a name by indexing this list.
# NOTE(review): "german" appears twice (positions 0 and 8). It is left untouched because
# removing an entry would shift the index of every later language — confirm the intent.
hp.languages = ["german", "dutch", "french", "greek", "japanese", "russian", "chinese", "finnish", "german", "hungarian", "spanish"]

# Alphabets: the space, two non-Latin scripts, plain ASCII letters, and the
# additional accented letters used by each Latin-script language.
common = ' '
greece = 'άέήίαβγδεζηθικλμνξοπρςíστυφχψωόύώ'
russian = 'абвгдежзийклмнопрстуфхцчшщъыьэюяё'

asciis = 'abcdefghijklmnopqrstuvwxyz'
chinese = 'ōǎǐǒàáǔèéìíūòóùúüāēěī'
finnish = 'éöä'
german = 'ßàäéöü'
hungarian = 'őáéíóöűúü'
french = 'àâçèéêíôùû'
spanish = 'áèéíñóöúü'

# Union of all alphabets with duplicates removed. The set is sorted before joining so
# the character order is the same on every kernel start; iterating a bare set of
# strings depends on per-process hash randomization and would reorder the symbols.
hp.characters = ''.join(sorted(set(
    common + greece + russian + asciis + chinese + finnish + german + hungarian + french + spanish
)))

In [None]:
# Parameter overrides applied before the dataset object is built.
hp.num_fft = 1102
hp.predict_linear = True

# The metadata file is looked up inside the dataset root directory.
dataset_root = "../data/css10"
metafile = "all_reduced.txt"

metafile_path = os.path.join(dataset_root, metafile)
data = TextToSpeechDataset(metafile_path, dataset_root)

In [None]:
# Per-item accumulators: entry k of every list describes data.items[k].
languages = []
durations = []
lengths = []
num_words = []
lengths_phon = []
# One symbol counter per language name, for characters and for phonemes.
freq_chars = {lang: Counter() for lang in hp.languages}
freq_phon = {lang: Counter() for lang in hp.languages}

total_items = len(data.items)
Logger.progress(0, prefix='Computing stats:')
for position, entry in enumerate(data.items, start=1):
    # item['language'] is an index into hp.languages.
    lang_name = hp.languages[entry['language']]
    languages.append(lang_name)

    # Duration is measured on the loaded waveform.
    wav = audio.load(os.path.join(dataset_root, entry['audio']))
    durations.append(audio.duration(wav))

    # Character statistics: the length counts the full utterance, while the word
    # count and symbol frequencies use the punctuation-free version.
    chars = text.to_text(entry['text'], use_phonemes=False)
    chars_clean = text.remove_punctuation(chars)
    lengths.append(len(chars))
    num_words.append(len(chars_clean.split()))
    freq_chars[lang_name].update(chars_clean.replace(' ', ''))

    # Phoneme statistics: the length counts the full phonemized utterance, the
    # frequencies ignore spaces and punctuation.
    phones = text.to_text(entry['phonemes'], use_phonemes=True)
    lengths_phon.append(len(phones))
    phones_clean = text.remove_punctuation(phones.replace(' ', ''))
    freq_phon[lang_name].update(phones_clean)

    Logger.progress(position / total_items, prefix='Computing stats:')

## Sample item from the dataset

In [None]:
# Look at the first dataset entry: its stored text field, the decoded text,
# the decoded phonemes and the duration of its audio.
item = data.items[0]

audio_path = item['audio']
full_audio_path = os.path.join(dataset_root, audio_path)
waveform = audio.load(full_audio_path)

stored_text = item['text']
print(stored_text)
print(text.to_text(stored_text, False))
print(text.to_text(item['phonemes'], True))
print(audio.duration(waveform))

# Both spectrogram variants of the same waveform.
melspec = audio.mel_spectrogram(waveform)
spec = audio.spectrogram(waveform)

# Data analysis

In [None]:
# Wide default figure size for the distribution plots, drawn on a plain white background.
figure_defaults = {'figure.figsize': (16, 4)}
sns.set(rc=figure_defaults)
sns.set_style(style="white")

In [None]:
# One row per dataset item, assembled from the lists collected in the stats loop.
# Each triple is (column name, source list, dtype); the list order is the column order.
column_specs = [
    ('Words', num_words, 'int'),
    ('Length', lengths, 'int'),
    ('Duration', durations, 'float'),
    ('LengthPhon', lengths_phon, 'int'),
    ('Language', languages, 'category'),
]
df = pd.DataFrame(
    {label: pd.Series(values, dtype=kind) for label, values, kind in column_specs},
    columns=[label for label, _, _ in column_specs],
)

In [None]:
# Successive threshold filters. The row count is printed once before the first
# filter and once after each filter, so the effect of every step is visible.
row_filters = (
    lambda frame: frame['Length'] < 190,
    lambda frame: frame['Duration'] < 10.1,
    lambda frame: frame['Duration'] > 0.5,
    lambda frame: frame['Length'] > 2,
)

print(len(df))
for keep_rows in row_filters:
    df = df[keep_rows(df)]
    print(len(df))

In [None]:
# Per-language outlier removal: fit a straight line Duration ~ Length for each
# language and keep only the rows whose duration stays close to that line.
kept_per_language = []
# observed=True skips language categories that have no rows left after the
# threshold filters; LinearRegression cannot be fitted on an empty group.
for _, group in df.groupby('Language', observed=True):
    x_length = group['Length'].values.reshape(-1, 1)
    y_duration = group['Duration'].values.reshape(-1, 1)

    lr = linear_model.LinearRegression().fit(x_length, y_duration)
    # With a 2-D target, predict returns one column; squeeze it to a flat array so it
    # lines up element-wise with the Duration Series.
    group_mean = lr.predict(x_length).squeeze(-1)

    # Allowed deviation from the fitted line grows slowly with text length:
    # log10(Length) + 1, compared against the absolute residual.
    # The per-length standard deviation that used to be computed here never took part
    # in this mask (its condition was commented out), so it is no longer computed;
    # on recent pandas that std() over the non-numeric Language column can also raise.
    residual = abs(group['Duration'] - group_mean)
    tolerance = np.log10(group['Length']) + 1
    kept_per_language.append(group[residual < tolerance])

# The loop used to prepend every group to the running result; concatenating the
# reversed list once keeps that row order without re-copying the frame each iteration.
# With no groups at all, fall back to an empty frame that keeps df's columns.
total = pd.concat(kept_per_language[::-1]) if kept_per_language else df.iloc[0:0]

df = total
print(len(df))

In [None]:
# out_file = "idxes_clean.txt"
# with open(os.path.join(dataset_root, out_file), mode='w') as f:
#     for i in sorted(df.index):
#         print(f'{i}'.zfill(6), file=f)

# join -t '|' idxes_clean.txt a.txt > b.txt

### Duration distribution

In [None]:
# Durations are summed in seconds and reported in hours, per language and overall.
SECONDS_PER_HOUR = 3600

for language, rows in df.groupby('Language'):
    language_hours = sum(rows["Duration"]) / SECONDS_PER_HOUR
    print(f'{language}:\t{language_hours}')

overall_hours = sum(df["Duration"]) / SECONDS_PER_HOUR
print(f'Total:\t{overall_hours}')

In [None]:
# One duration distribution plot per language, preceded by its min/max duration.
# kde_kws / fit_kws style the density curve (blue) and the fitted normal curve (red).
for language, rows in df.groupby('Language'):
    language_durations = rows['Duration']
    shortest = min(language_durations)
    longest = max(language_durations)
    print(f'Min duration: {shortest}')
    print(f'Max duration: {longest}')

    ax = sns.distplot(
        language_durations,
        hist=True,
        rug=False,
        fit=stats.norm,
        color="c",
        kde_kws=dict(color="b", lw=3),
        fit_kws=dict(color="r", lw=3),
    )
    ax.set(xlabel='Duration (s)', title=language)
    plt.show()

### Length distribution

In [None]:
# One text-length distribution plot per language, preceded by its min/max length.
# kde_kws / fit_kws style the density curve (blue) and the fitted normal curve (red).
for language, rows in df.groupby('Language'):
    language_lengths = rows['Length']
    shortest = min(language_lengths)
    longest = max(language_lengths)
    print(f'Min length: {shortest}')
    print(f'Max length: {longest}')

    ax = sns.distplot(
        language_lengths,
        kde=True,
        rug=False,
        fit=stats.norm,
        color="c",
        kde_kws=dict(color="b", lw=3),
        fit_kws=dict(color="r", lw=3),
    )
    ax.set(xlabel='Length', title=language)
    plt.show()

### Word count distribution

In [None]:
# Distribution of the per-utterance word count over all languages together,
# with a density curve (blue) and a fitted normal curve (red).
word_counts = df['Words']
ax = sns.distplot(
    word_counts,
    kde=True,
    rug=False,
    fit=stats.norm,
    color="c",
    kde_kws=dict(color="b", lw=3),
    fit_kws=dict(color="r", lw=3),
)
ax.set(xlabel='Word count');

### Phonemized length distribution

In [None]:
# Distribution of the phonemized utterance length over all languages together,
# with a density curve (blue) and a fitted normal curve (red).
phonemized_lengths = df['LengthPhon']
ax = sns.distplot(
    phonemized_lengths,
    kde=True,
    rug=False,
    fit=stats.norm,
    color="c",
    kde_kws=dict(color="b", lw=3),
    fit_kws=dict(color="r", lw=3),
)
ax.set(xlabel='Phonemized length');

### Duration vs Length

In [None]:
# Hexbin joint plot of text length against duration, one figure per language.
for name, group in df.groupby('Language'):
    # x / y are passed by keyword: newer seaborn releases accept them only as keyword
    # arguments, and the keyword form is also valid on older releases.
    ax = sns.jointplot(x=group['Length'], y=group['Duration'], kind="hex", space=0, color="b")
    ax.fig.set_figwidth(7)
    ax.ax_joint.set(xlabel='Length', ylabel='Duration', title=name);

In [None]:
# Grid lines make the line plot in the next cell easier to read.
sns.set_style(style="whitegrid")

In [None]:
# Duration as a function of text length, drawn as a line with an "sd" band (ci="sd").
shortest = min(df['Duration'])
longest = max(df['Duration'])

grid = sns.relplot(data=df, x="Length", y="Duration", kind="line", ci="sd", linewidth=3)
grid.fig.set_figwidth(15)
grid.fig.set_figheight(4)
# A tick every 2 units, with one unit of padding below and above the data range.
grid.set(yticks=np.arange(round(shortest), longest + 1, 2))
plt.ylim(shortest - 1, longest + 1);

In [None]:
# Back to the plain white background for the following plots.
sns.set_style(style="white")

### Duration vs Phonemized length

In [None]:
# Hexbin joint plot of phonemized length against duration over all languages.
# x / y are passed by keyword: newer seaborn releases accept them only as keyword
# arguments, and the keyword form is also valid on older releases.
ax = sns.jointplot(x=df['LengthPhon'], y=df['Duration'], kind="hex", space=0, color="b")
ax.fig.set_figwidth(7)
# The x axis shows LengthPhon, so it is labelled as phonemized length (it used to say 'Word count').
ax.ax_joint.set(xlabel='Phonemized length', ylabel='Duration');

In [None]:
# Grid lines make the line plot in the next cell easier to read.
sns.set_style(style="whitegrid")

In [None]:
# Duration as a function of phonemized length, drawn as a line with an "sd" band (ci="sd").
# Both axis limits and the ticks come from the filtered frame; the lower limit used to
# be taken from the raw, unfiltered `durations` list, which disagreed with the ticks
# and with the equivalent Length plot.
shortest = min(df['Duration'])
longest = max(df['Duration'])

ax = sns.relplot(x="LengthPhon", y="Duration", kind="line", ci="sd", linewidth=3, data=df)
ax.fig.set_figwidth(15)
ax.fig.set_figheight(4)
# A tick every 2 units, with one unit of padding below and above the data range.
ax.set(yticks=np.arange(round(shortest), longest + 1, 2))
plt.ylim(shortest - 1, longest + 1);

In [None]:
# Back to the plain white background for the following plots.
sns.set_style(style="white")

### Phonemes distribution

In [None]:
# Phoneme inventory from the parameters with every space character removed;
# the bare name on the last line displays it as the cell output.
symbols_phon = ''.join(hp.phonemes.split(' '))
symbols_phon

In [None]:
# One bar chart per language: phoneme symbols in sorted order against their counts.
for language, counts in freq_phon.items():
    symbols = sorted(counts.keys())
    heights = [counts[symbol] for symbol in symbols]
    g = sns.barplot(x=symbols, y=heights).set_title(language)
    plt.show()

In [None]:
# Merge the per-language phoneme counters into one overall counter.
total = Counter()
for language_counts in freq_phon.values():
    total.update(language_counts)

# Bars follow the order of symbols_phon; a symbol that never occurred counts as 0.
ordered_symbols = list(symbols_phon)
ordered_counts = [total[symbol] for symbol in ordered_symbols]
sns.barplot(x=ordered_symbols, y=ordered_counts).set_title("Total")
plt.show()

In [None]:
# All phoneme symbols that actually occurred, in the order they were first counted.
''.join(total)