# SETUP

In [1]:
%%capture
!pip install fastapi kaleido python-multipart uvicorn
!pip install --upgrade datasets
!pip install pandas pydub
!pip install --upgrade librosa scipy soundfile

In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from pydub import AudioSegment
import wave
import os
import io
import librosa
from scipy.signal import resample as scipy_resample

In [3]:
# Project root: every relative path used below is resolved against it.
# Set the WOLOF_PROJECT_DIR environment variable to run the notebook on another
# machine; the original local path is kept as the default.
path = os.environ.get(
    "WOLOF_PROJECT_DIR",
    r"C:\Users\maron\OneDrive\02-Documents\03.PROJETS\00.INFORMATIQUE\02.AI\WOLOF",
)
os.chdir(path)

# Functions

In [28]:
def resample_audio(input_path, path_audio):
    """Return the duration of one audio file, or ``None`` if it cannot be read.

    Despite its name, this function does not resample anything: it loads the
    file with ``sr=None`` (no resampling on load) and measures its duration
    with ``librosa.get_duration``.

    Args:
        input_path: File name (or relative path) of the clip. It is appended
            to ``path_audio`` by plain string concatenation.
        path_audio: Directory prefix; it must already end with a path
            separator because no separator is inserted.

    Returns:
        The value returned by ``librosa.get_duration`` for the decoded audio,
        or ``None`` when loading or measuring failed.
    """
    try:
        audio, sr = librosa.load(path_audio + input_path, sr=None)
        return librosa.get_duration(y=audio, sr=sr)
    except Exception as exc:
        # Best-effort on purpose: one unreadable clip must not abort a whole
        # DataFrame.apply run. `except Exception` (not a bare `except`) lets
        # KeyboardInterrupt / SystemExit through so a long run can be stopped.
        # str() keeps the handler itself from failing on a non-string
        # input_path (e.g. NaN coming from a DataFrame column).
        print('ERROR: failed to read:  ' + str(input_path)
              + ' (' + type(exc).__name__ + ': ' + str(exc) + ')')
        return None

In [14]:
def resample_audio_(input_path, path_audio, target_sr=16000):
    """Load one clip and return its samples resampled to ``target_sr``.

    Args:
        input_path: File name (or relative path) of the clip; concatenated
            directly after ``path_audio``.
        path_audio: Directory prefix, expected to end with a path separator.
        target_sr: Sampling rate handed to ``librosa.resample`` as the target
            (16000 by default).

    Returns:
        Whatever ``librosa.resample`` returns for the loaded signal.
    """
    source_file = path_audio + input_path
    # sr=None: load without resampling, so the conversion below is the only
    # resampling step.
    waveform, native_rate = librosa.load(source_file, sr=None)
    return librosa.resample(waveform, orig_sr=native_rate, target_sr=target_sr)

In [6]:
def rename_path_to_path_complet(row, path, ext):
    """Build a full path string: ``path``, a backslash, ``row``, then ``ext``.

    Args:
        row: File name (or relative path) to place under ``path``.
        path: Directory prefix, without a trailing separator.
        ext: Suffix appended verbatim after ``row`` (may be empty).

    Returns:
        The concatenated string. The separator is always a backslash.
    """
    pieces = [path, '\\', row, ext]
    return ''.join(pieces)

# Data loading

## Common Voice

In [7]:
# Clip table of the zenodo export (tab-separated); the first two rows are
# shown transposed so every column is visible.
voiceweb_clips = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\tsv\voiceweb_clips.tsv")
voiceweb_clips.head(2).T

Unnamed: 0,0,1
id,1,2
client_id,4af26658-5f2a-401c-aa35-748717079afa,4af26658-5f2a-401c-aa35-748717079afa
path,4af26658-5f2a-401c-aa35-748717079afa/02d98c90f...,4af26658-5f2a-401c-aa35-748717079afa/038677fc2...
sentence,"""Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yee...",Jamonoy cëtëŋ j
original_sentence_id,02d98c90f6950423ebb7f25f436480e84c6baf149ab036...,038677fc2a6c36b549322f47fdd083a51a58e220ab10d8...
created_at,2020-09-15 17:45:30,2020-09-15 17:45:31
bucket,train,train
locale_id,2,2
needs_votes,1,1
is_valid,,1.0


In [8]:
# Sentence table of the zenodo export (tab-separated); the first two rows are
# shown transposed so every column is visible.
voiceweb_sentence = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\tsv\voiceweb_sentences.tsv")
voiceweb_sentence.head(2).T

Unnamed: 0,0,1
id,0000dd0a940305d48c1bf2852f333629cb9db6b9e73d89...,0002480565c93f8619bb3d2f5f0af22613530552af2b8e...
text,Mbaa yaa ngi am jàmm.,Ndax solu ngeen?
is_used,1,1
created_at,2020-09-15 15:59:02,2020-09-15 15:59:03
bucket,train,train
locale_id,2,2
clips_count,2,2
version,1,1
source,fr_data_to_be_recorded_for_tts,fr_data_to_be_recorded_for_tts
has_valid_clip,0,0


In [9]:
# Discard sentence rows whose `text` is missing.
voiceweb_sentence = voiceweb_sentence.dropna(axis="index", subset=['text'])

## Female voices

In [10]:
# Read the three female-speaker splits (tab-separated files).
# The validation file name is spelled "valdiation" in the path below.
female_test = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\female_test.tsv")
female_train = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\female_train.tsv")
female_validation = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\female_valdiation.tsv")

# Stack the splits row-wise (train, validation, test) under a fresh index.
dataframes = [female_train, female_validation, female_test]
female = pd.concat(dataframes, ignore_index=True)
female.head(2).T

Unnamed: 0,0,1
id,5599,5600
client_id,fbc2f100-357a-4840-b1df-8b562861e8dd,fbc2f100-357a-4840-b1df-8b562861e8dd
path,data-commonvoice/audios/fbc2f100-357a-4840-b1d...,data-commonvoice/audios/fbc2f100-357a-4840-b1d...
sentence,Nanga moytu jegeñante bi.,Asi rekk la taqalool ci yeneen gox yi
original_sentence_id,2627b1b38d06bf257a22a85ea67e240f9b43db39d4eba2...,2a048ddc15edf6bf932ad8b9ba0a4018866f79ae17f9ba...
created_at,2020-12-03 14:19:02,2020-12-03 14:19:03
bucket,train,train
locale_id,2,2
needs_votes,1,1
is_valid,,


In [11]:
# Reduced copy of `female` without the bookkeeping columns.
# Each label is listed once, and errors='ignore' skips labels that are not
# present instead of raising KeyError ('validated_at' does not appear in the
# preview of `female` shown earlier in this notebook).
female_red = female.drop(
    columns=['is_valid', 'needs_votes', 'locale_id', 'bucket', 'created_at', 'validated_at', 'client_id'],
    errors='ignore',
)
female_red.head(2).T

Unnamed: 0,0,1
id,5599,5600
path,data-commonvoice/audios/fbc2f100-357a-4840-b1d...,data-commonvoice/audios/fbc2f100-357a-4840-b1d...
sentence,Nanga moytu jegeñante bi.,Asi rekk la taqalool ci yeneen gox yi
original_sentence_id,2627b1b38d06bf257a22a85ea67e240f9b43db39d4eba2...,2a048ddc15edf6bf932ad8b9ba0a4018866f79ae17f9ba...
caracter_count,25,37
word_count,4,8


## Male voices

In [12]:
# Read the three male-speaker splits (tab-separated files).
male_test = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\male_test.tsv")
male_train = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\male_train.tsv")
male_validation = pd.read_table(r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\zenodo\male_validation.tsv")

# Stack the splits row-wise (train, validation, test) under a fresh index.
dataframes = [male_train, male_validation, male_test]
male = pd.concat(dataframes, ignore_index=True)
male.head(2).T

Unnamed: 0,0,1
id,4080,4082
client_id,ba524146-26b0-4d13-81fd-5a16547d9ce3,ba524146-26b0-4d13-81fd-5a16547d9ce3
path,data-commonvoice/audios/ba524146-26b0-4d13-81f...,data-commonvoice/audios/ba524146-26b0-4d13-81f...
sentence,Te boo wooteeb xeex ñu néew ñoo la ciy fekksi.,Ñëw leen ci waat yii boog
original_sentence_id,1fea98a25ffdfda32850ef55825f81834c8810ba2357ea...,19e9b3b644e6a8bd0c470e2e71a1d4da1a8d37ec6007d5...
created_at,2020-12-02 09:09:09,2020-12-02 09:09:11
bucket,train,train
locale_id,2,2
needs_votes,1,1
is_valid,,


In [13]:
# Reduced copy of `male` without the bookkeeping columns.
# Each label is listed once, and errors='ignore' skips labels that are not
# present instead of raising KeyError ('validated_at' does not appear in the
# preview of `male` shown earlier in this notebook).
male_red = male.drop(
    columns=['is_valid', 'needs_votes', 'locale_id', 'bucket', 'created_at', 'validated_at', 'client_id'],
    errors='ignore',
)
male_red.head(2).T

Unnamed: 0,0,1
id,4080,4082
path,data-commonvoice/audios/ba524146-26b0-4d13-81f...,data-commonvoice/audios/ba524146-26b0-4d13-81f...
sentence,Te boo wooteeb xeex ñu néew ñoo la ciy fekksi.,Ñëw leen ci waat yii boog
original_sentence_id,1fea98a25ffdfda32850ef55825f81834c8810ba2357ea...,19e9b3b644e6a8bd0c470e2e71a1d4da1a8d37ec6007d5...
caracter_count,46,25
word_count,10,6


# Data preparation

In [15]:
# Attach to every clip the sentence row whose `id` equals the clip's
# `original_sentence_id` (inner join: clips without a match are dropped).
merged_df = pd.merge(
    voiceweb_clips,
    voiceweb_sentence,
    how='inner',
    left_on='original_sentence_id',
    right_on='id',
)

# Keep only what is needed downstream: the sentence text and the clip path.
zenodo = pd.DataFrame({
    'transcription': merged_df['text'],
    'filename': merged_df['path'],
})

zenodo.head(2).T

Unnamed: 0,0,1
transcription,"""Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yee...","""Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yee..."
filename,4af26658-5f2a-401c-aa35-748717079afa/02d98c90f...,fbc2f100-357a-4840-b1df-8b562861e8dd/02d98c90f...


In [22]:
import IPython.display as ipd
import random

# Spot check: pick one row at random and play its audio. `rand_int` is kept
# as a global so the transcription of the same row can be printed afterwards.
rand_int = random.randrange(len(zenodo))

ipd.Audio(
    data=resample_audio_(
        zenodo["filename"].iloc[rand_int],
        r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\\",
    ),
    rate=16000,  # same value as resample_audio_'s default target_sr
    autoplay=True,
)

In [23]:
# Transcription of the row selected by `rand_int`.
print(zenodo["transcription"].iat[rand_int])

Wax naa la nga génn.


In [24]:
# Intermediate save of the transcription/filename table; the same file is
# written again at the end of the notebook.
zenodo.to_csv(path_or_buf=r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\zenodo_cleaned.csv")

In [25]:
# Add a `path` column: the audio directory prefix, a backslash, then the file name.
# NOTE(review): this prefix starts at DATA\ while the audio is read elsewhere
# from SPEECH_TO_TEXT\DATA\...; confirm the difference is intentional.
zenodo['path'] = zenodo['filename'].map(
    lambda clip_name: rename_path_to_path_complet(
        clip_name,
        r"DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio",
        ext='',
    )
)
zenodo.head(1).T

Unnamed: 0,0
transcription,"""Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yee..."
filename,4af26658-5f2a-401c-aa35-748717079afa/02d98c90f...
path,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\4a...


# Final cleaned data

In [29]:
# Measure every clip's duration. resample_audio returns None for files it
# cannot read, so those rows end up with a missing `length`.
path_audio = r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\\"
zenodo['length'] = zenodo['filename'].map(
    lambda clip_name: resample_audio(clip_name, path_audio)
)
zenodo.head(3).T

  audio, sr = librosa.load(path_audio + input_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


ERROR: failed to read:  29f41f59-63cd-4220-82b0-3d6796fca217/029d11b55186a36cea1c3fea6e7a21e7f0e94d3526c014c6b8cb132f3f3a5131.mp3
ERROR: failed to read:  29f41f59-63cd-4220-82b0-3d6796fca217/107951bdcd76285969c6e74dd044b4b601c654ee14b26b1b3f35930261b1db66.mp3
ERROR: failed to read:  29f41f59-63cd-4220-82b0-3d6796fca217/11ea1a103a00a3adc7d7c4b0a2a8936124c77704c274c9e3d9c89b919cebf45e.mp3
ERROR: failed to read:  29f41f59-63cd-4220-82b0-3d6796fca217/1155f5caea70cb67398589896f16b3cc896ffe05dc8ce79a9019dddf0fbed7eb.mp3
ERROR: failed to read:  29f41f59-63cd-4220-82b0-3d6796fca217/0e646c55c474410a9d5d21b2c2f709c381bc6479925706fcf75e2a24349dcd74.mp3
ERROR: failed to read:  ba524146-26b0-4d13-81fd-5a16547d9ce3/2f6296b54fb86357419db29d817932d18fef59316aa84391a01ee7a88c813f71.mp3
ERROR: failed to read:  ba524146-26b0-4d13-81fd-5a16547d9ce3/5ca5045d353d1b2c0955418161248287d49551a77507d3019d022b41a6e03101.mp3
ERROR: failed to read:  ba524146-26b0-4d13-81fd-5a16547d9ce3/5ef51a52f3f105505103c286123be

Unnamed: 0,0,1,2
transcription,"""Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yee...","""Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yee...",Jamonoy cëtëŋ j
filename,4af26658-5f2a-401c-aa35-748717079afa/02d98c90f...,fbc2f100-357a-4840-b1df-8b562861e8dd/02d98c90f...,4af26658-5f2a-401c-aa35-748717079afa/038677fc2...
path,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\4a...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\fb...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\4a...
length,6.384,5.616,3.744


In [30]:
# Discard the clips whose duration could not be measured.
zenodo = zenodo.dropna(axis="index", subset=['length'])

In [31]:
# Total of the `length` column over the remaining clips.
print(sum(zenodo['length'].tolist()))

163136.7146122475


In [32]:
def clean_text(row):
    """Remove every leading and trailing double-quote character from ``row``.

    Quotes inside the text are left untouched.
    """
    return row.strip('"')

In [33]:
# Strip the surrounding double quotes from every transcription.
zenodo['transcription'] = zenodo['transcription'].map(clean_text)

In [34]:
# Final save: overwrites the CSV written earlier with the cleaned table.
zenodo.to_csv(path_or_buf=r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\zenodo_cleaned.csv")