# ASR: Sphinx

In [None]:
import os
import wget
import tarfile
import argparse
import csv
from multiprocessing.pool import ThreadPool
import subprocess

## Download data

In [None]:
# Directory that receives the downloaded archive and everything derived from it.
target_dir = './'
os.makedirs(target_dir, exist_ok=True)

# The corpus archive is unpacked into its own sub-directory.
target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
os.makedirs(target_unpacked_dir, exist_ok=True)

# NOTE(review): `args` and `COMMON_VOICE_URL` are not defined in any cell shown
# here; they must be set before this cell is run.
if args.tar_path and os.path.exists(args.tar_path):
    # Reuse an archive that is already on disk instead of downloading it again.
    print('Find existing file {}'.format(args.tar_path))
    target_file = args.tar_path
else:
    print("Could not find downloaded Common Voice archive, Downloading corpus...")
    filename = wget.download(COMMON_VOICE_URL, target_dir)
    target_file = os.path.join(target_dir, os.path.basename(filename))

print("Unpacking corpus to {} ...".format(target_unpacked_dir))
# The context manager closes the archive even when extraction fails part-way.
# NOTE: extractall() trusts the member paths stored in the archive, so only
# unpack archives that come from a trusted source.
with tarfile.open(target_file) as tar:
    tar.extractall(target_unpacked_dir)

## Data Preparation
1. Create ID files 
2. Create Transcription files
3. Convert to wav

In [None]:
# Output locations: converted audio goes to wav/, control files to etc/.
wav_dir = os.path.join(target_dir, 'wav/')
etc_dir = os.path.join(target_dir, 'etc/')
# Resolve the directory first: os.path.basename('./') (or any path ending in a
# slash) is '', which would produce files named '_train.fileids'.
dataset_name = os.path.basename(os.path.abspath(target_dir))
os.makedirs(wav_dir, exist_ok=True)
os.makedirs(etc_dir, exist_ok=True)
# Directory holding the corpus CSV files; the `filename` column of those CSVs
# is joined onto this path when the audio is converted.
path_to_data = os.path.join(target_unpacked_dir, 'cv_corpus_v1')

In [None]:
def create_fileids(csv_file, target_dir, dataset_type):
    """Write the ``.fileids`` control file for one dataset split.

    One utterance id per line is written to
    ``<etc_dir>/<dataset_name>_<dataset_type>.fileids``. The id is
    ``<dataset_type>_<source file name without extension>``, i.e. the name of
    the converted wav file without its ``.wav`` suffix.

    Arguments:
        csv_file (string): corpus CSV with a ``filename`` column
        target_dir (string): unused; kept for call compatibility
        dataset_type (string): split prefix, e.g. ``'train'`` or ``'test'``

    Returns:
        list of ``(source file name, wav path)`` tuples, one per CSV row, in
        the form expected by ``convert_to_wav``.

    Reads the module-level ``etc_dir``, ``wav_dir`` and ``dataset_name``.
    """
    print('Creating fileids file for {}.'.format(csv_file))
    ids_path = os.path.join(etc_dir, dataset_name + '_' + dataset_type + '.fileids')
    data = []
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(csv_file, newline='') as csvfile, open(ids_path, 'w') as ids_file:
        for row in csv.DictReader(csvfile):
            file_name = row['filename']
            utt_id = dataset_type + '_' + os.path.splitext(os.path.basename(file_name))[0]
            wav_path = os.path.join(wav_dir, utt_id + '.wav')
            # Write the whole id; indexing the basename with [0] kept only its
            # first character.
            ids_file.write(utt_id + '\n')
            data.append((file_name, wav_path))
    return data

In [None]:
def create_transcript_files(csv_file, target_dir, dataset_type):
    """Write the ``.transcription`` file for one dataset split.

    Each CSV row becomes one line of the form
    ``<s> TEXT </s> (UTTERANCE_ID)`` in
    ``<etc_dir>/<dataset_name>_<dataset_type>.transcription``. The utterance
    id is ``<dataset_type>_<source file name without extension>``, the same
    id that ``create_fileids`` derives for the row.

    Arguments:
        csv_file (string): corpus CSV with ``filename`` and ``text`` columns
        target_dir (string): unused; kept for call compatibility
        dataset_type (string): split prefix, e.g. ``'train'`` or ``'test'``

    Reads the module-level ``etc_dir`` and ``dataset_name``.
    """
    print('Creating transcript files for {}.'.format(csv_file))
    trans_path = os.path.join(etc_dir, dataset_name + '_' + dataset_type + '.transcription')
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(csv_file, newline='') as csvfile, open(trans_path, 'w') as trans_file:
        for row in csv.DictReader(csvfile):
            # Use the whole id; indexing the basename with [0] kept only its
            # first character.
            utt_id = dataset_type + '_' + os.path.splitext(os.path.basename(row['filename']))[0]
            trans_file.write('<s> ' + row['text'] + ' </s> (' + utt_id + ')' + '\n')

In [None]:
def convert_to_wav(x):
    """Convert one source audio file to a 16-bit mono wav file with sox.

    Arguments:
        x (tuple): ``(file_path, wav_path)`` where ``file_path`` is relative
            to the module-level ``path_to_data`` and ``wav_path`` is the
            destination file.

    The output sample rate is taken from ``args.sample_rate``. A failed
    conversion is reported but does not raise, so a batch run keeps going.
    """
    file_path, wav_path = x
    # Pass an argument list instead of a shell string so that paths containing
    # spaces or shell metacharacters reach sox unchanged.
    cmd = [
        "sox",
        os.path.join(path_to_data, file_path),
        "-r", str(args.sample_rate),
        "-b", "16",
        "-c", "1",
        wav_path,
    ]
    status = subprocess.call(cmd)
    if status != 0:
        print('sox exited with status {} for {}'.format(status, file_path))

### Create training and validation data

In [None]:
# Format the training data: write the fileids and transcription files, then
# convert the audio with 10 worker threads.
train_csv_file = os.path.join(target_unpacked_dir, 'cv_corpus_v1/', 'cv-valid-train.csv')
train_wav_files = create_fileids(train_csv_file, target_dir, 'train')
create_transcript_files(train_csv_file, target_dir, 'train')
with ThreadPool(10) as pool:
    pool.map(convert_to_wav, train_wav_files)

# Format the validation data. The dev split of the corpus is written out under
# the 'test' prefix.
val_csv_file = os.path.join(target_unpacked_dir, 'cv_corpus_v1/', 'cv-valid-dev.csv')
val_wav_files = create_fileids(val_csv_file, target_dir, 'test')
create_transcript_files(val_csv_file, target_dir, 'test')
with ThreadPool(10) as pool:
    pool.map(convert_to_wav, val_wav_files)

### Create word list from the training data

In [None]:
import collections
import os

# Count every distinct word of the training transcripts. Words are lower-cased
# before counting so that each word appears only once in the word list.
counter = collections.Counter()
with open(train_csv_file, newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        # update() adds in place; `counter += Counter(...)` rescans the whole
        # counter for every row.
        counter.update(word.lower() for word in row['text'].split())

# One word per line, in first-seen order.
with open(os.path.join(etc_dir, 'common_voice.words'), 'w') as f:
    for item in counter:
        f.write(item + '\n')

### Create phonetic dictionary (lexicon) 
The phone list is the list of phones that will be used in the model. 
A default phone list is provided. 

A lexicon/phonetic dictionary can be obtained for each of the words in the dataset by using the [CMU Lextool](http://www.speech.cs.cmu.edu/tools/lextool.html).

We provide one so that you can skip some of the otherwise necessary formatting (converting to lower case, replacing phones to match the phone list).

In [None]:
%%bash
# Print the phone list file.
cat etc/common_voice.phone

In [None]:
%%bash
# Print the first lines of the phonetic dictionary as a quick format check.
head etc/common_voice.dic

### Create Language Model

In [None]:
# Check that the transcript-only training text exists (ls reports an error if
# it is missing).
# NOTE(review): no cell shown in this notebook creates this file; it appears
# to be expected to be provided separately - confirm.
!ls etc/common_voice_train_transcript_only.txt

In [None]:
%%bash
# Create vocab file from the transcript-only training text
text2wfreq < etc/common_voice_train_transcript_only.txt | wfreq2vocab > etc/common_voice.vocab

# Create n-gram count from training transcript file
# NOTE(review): this reads the .transcription file, whose lines also carry the
# "(utterance id)" suffix, while the vocab above comes from the
# transcript-only text - confirm this input is intended.
text2idngram -vocab etc/common_voice.vocab -idngram etc/common_voice.idngram < etc/common_voice_train.transcription

# Create language model (ARPA format) from n-grams
idngram2lm -vocab_type 0 -idngram etc/common_voice.idngram -vocab etc/common_voice.vocab -arpa etc/common_voice.lm

# Convert language model to DMP format
# (no leading "!": that is IPython syntax and is not a valid command in bash)
sphinx_lm_convert -i etc/common_voice.lm -o etc/common_voice.lm.DMP

In [None]:
%%bash
# Convert the ARPA language model and write it to etc/common_voice.dmp.
# NOTE(review): the other conversion cells write etc/common_voice.lm.DMP, which
# is the file the prediction cell reads - confirm this extra copy is needed.
sphinx_lm_convert -i etc/common_voice.lm -o etc/common_voice.dmp

In [None]:
%%bash
# Print the first lines of the generated ARPA language model.
head etc/common_voice.lm

### Pre-processing complete
Make sure we have all the files necessary.
```
$ls etc/

common_voice.dic      common_voice.vocab
common_voice.filler   common_voice_test.fileids
common_voice.idngram  common_voice_test.transcription
common_voice.lm       common_voice_train.fileids
common_voice.lm.bin   common_voice_train.transcription
common_voice.phone
```

In [None]:
# List etc/ to compare against the expected file listing.
!ls etc/

## Run Setup 
Creates the config for the training process

In [None]:
%%bash

# Run the sphinxtrain setup step for the task named common_voice; this creates
# the config for the training process.
sphinxtrain -t common_voice setup

## Train Sphinx Model
Because Sphinx is CPU-based, training on a large dataset can take a very long time.

In [None]:
import subprocess
import time
import os
import sys

cmd = ["sphinxtrain", "run"]

# stderr is merged into stdout so the whole log is streamed in order. The
# context manager closes the pipe and waits for the process to finish.
with subprocess.Popen(cmd,
                      stdout=subprocess.PIPE,
                      stderr=subprocess.STDOUT) as p:
    for line in iter(p.stdout.readline, b''):
        # errors='replace' keeps streaming if the tool emits non-UTF-8 bytes.
        print(">>> " + line.decode(errors='replace').rstrip())

# Report a failed run instead of letting it pass unnoticed.
if p.returncode != 0:
    print(">>> sphinxtrain exited with status {}".format(p.returncode))

In [None]:
# Convert the ARPA language model to etc/common_voice.lm.DMP, the file passed
# to pocketsphinx_batch via -lm in the prediction cell.
!sphinx_lm_convert -i etc/common_voice.lm -o etc/common_voice.lm.DMP

The following settings need to be changed in the config:

```
$CFG_CD_TRAIN = 'no';
$DEC_CFG_MODEL_NAME = "$CFG_EXPTNAME.ci_cont";
```


In [None]:
import subprocess
import time
import os
import sys

# NOTE(review): "-s decode" presumably restricts the run to the decode stage -
# confirm against the sphinxtrain usage text.
cmd = ["sphinxtrain", "-s", "decode", "run"]

# stderr is merged into stdout so the whole log is streamed in order. The
# context manager closes the pipe and waits for the process to finish.
with subprocess.Popen(cmd,
                      stdout=subprocess.PIPE,
                      stderr=subprocess.STDOUT) as p:
    for line in iter(p.stdout.readline, b''):
        # errors='replace' keeps streaming if the tool emits non-UTF-8 bytes.
        print(">>> " + line.decode(errors='replace').rstrip())

# Report a failed run instead of letting it pass unnoticed.
if p.returncode != 0:
    print(">>> sphinxtrain exited with status {}".format(p.returncode))

In [None]:
# List the working directory to see what the previous steps produced.
!ls 

## Predict on test set

In [None]:
%%bash
# Batch-decode the utterances listed in test.fileids (wav files under test/)
# with the trained ci_cont model, the DMP language model and the dictionary;
# hypotheses are written to predictions.hyp.
# NOTE(review): the preparation cells write audio to wav/ and the control file
# to etc/common_voice_test.fileids, not test/ and test.fileids - confirm paths.
pocketsphinx_batch  -adcin yes  -cepdir test/  -cepext .wav  -ctl test.fileids -hmm model_parameters/common_voice.ci_cont/ -lm etc/common_voice.lm.DMP -dict etc/common_voice.dic -hyp predictions.hyp

In [None]:
def wer(s1, s2):
    """
    Returns the word error rate of two sentences as a formatted string.

    The rate is the word-level edit distance between the sentences, divided
    by the number of words in ``s1`` and expressed as a percentage.
    Arguments:
    s1 (string): space-separated sentence; its word count is the denominator
    s2 (string): space-separated sentence
    Returns:
    string of the form 'WER: 12.34'
    Raises:
    ZeroDivisionError: if ``s1`` contains no words
    """
    words1 = s1.split()
    words2 = s2.split()

    # Give every distinct word its own integer code.
    vocabulary = set(words1 + words2)
    word2char = {word: code for code, word in enumerate(vocabulary)}

    # The Levenshtein package only works on strings, so each word is turned
    # into the single character with that code.
    encoded1 = ''.join(chr(word2char[word]) for word in words1)
    encoded2 = ''.join(chr(word2char[word]) for word in words2)

    wer_lev = Lev.distance(encoded1, encoded2)
    wer_inst = float(wer_lev) / len(words1) * 100
    return 'WER: {0:.2f}'.format(wer_inst)

def cer(s1, s2):
    """
    Returns the character error rate of two sentences as a formatted string.

    Spaces are removed from both sentences first. The rate is the edit
    distance between the remaining characters, divided by the number of
    characters left in ``s1`` and expressed as a percentage.
    Arguments:
    s1 (string): space-separated sentence; its length is the denominator
    s2 (string): space-separated sentence
    Returns:
    string of the form 'CER: 12.34'
    Raises:
    ZeroDivisionError: if ``s1`` is empty once spaces are removed
    """
    chars1 = s1.replace(' ', '')
    chars2 = s2.replace(' ', '')
    edits = Lev.distance(chars1, chars2)
    cer_inst = float(edits) / len(chars1) * 100
    return 'CER: {0:.2f}'.format(cer_inst)

In [None]:
import Levenshtein as Lev
def word_lev(s1, s2):
    """
    Returns the word-level edit distance between two sentences.

    This is the raw number of word edits, not a rate; the caller does the
    normalisation.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    words1 = s1.split()
    words2 = s2.split()

    # Give every distinct word its own integer code.
    vocabulary = set(words1 + words2)
    word2char = {word: code for code, word in enumerate(vocabulary)}

    # The Levenshtein package only works on strings, so each word is turned
    # into the single character with that code.
    encoded1 = ''.join(chr(word2char[word]) for word in words1)
    encoded2 = ''.join(chr(word2char[word]) for word in words2)

    return Lev.distance(encoded1, encoded2)

def char_lev(s1, s2):
    """
    Returns the character-level edit distance between two sentences.

    Spaces are removed from both sentences before they are compared. This is
    the raw number of character edits, not a rate.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    chars1 = s1.replace(' ', '')
    chars2 = s2.replace(' ', '')
    return Lev.distance(chars1, chars2)

In [None]:
import pandas as pd
# Reference transcripts (the `text` column is used below).
target_data = pd.read_csv('cv-valid-test.csv')
# One hypothesis per line; the file has no header row.
predicted_data = pd.read_csv('predictions.hyp', names=['text'], header=None)
# Remove the parenthesised part of each hypothesis line. regex=True is passed
# explicitly because pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the text unchanged.
predicted_data['text'] = predicted_data['text'].str.replace(r"\(.*\)", "", regex=True)

In [None]:
index = 2
# wer() divides by the word count of its first argument, so the reference
# transcript goes first; the edit distance itself is symmetric.
print(str(wer(target_data['text'][index], predicted_data['text'][index])))
print('Hypothesis: '+ predicted_data['text'][index] +'\nTarget: ' + target_data['text'][index])

In [None]:
total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
verbose = True

# Hypotheses and references are paired by row position.
for transcript, reference in zip(predicted_data['text'], target_data['text']):
    wer_inst = word_lev(transcript, reference)
    cer_inst = char_lev(transcript, reference)
    ref_tokens = len(reference.split())
    # char_lev() ignores spaces, so the character count used as the CER
    # denominator must ignore them as well.
    ref_chars = len(reference.replace(' ', ''))
    total_wer += wer_inst
    total_cer += cer_inst
    num_tokens += ref_tokens
    num_chars += ref_chars
    if verbose:
        print("Ref:", reference.lower())
        print("Hyp:", transcript.lower())
        print("WER:", 100*float(wer_inst) / ref_tokens, "CER:", 100*float(cer_inst) / ref_chars, "\n")

In [None]:
# Corpus-level rates: total edits divided by the total reference length.
# Separate names are used so that the wer() and cer() functions defined
# earlier are not overwritten by these floats.
avg_wer = float(total_wer) / num_tokens
avg_cer = float(total_cer) / num_chars

print('Test Summary \t'
      'Average WER {wer:.3f}\t'
      'Average CER {cer:.3f}\t'.format(wer=avg_wer * 100, cer=avg_cer * 100))