# SETUP

In [1]:
%%capture
!pip install fastapi kaleido python-multipart uvicorn
!pip install --upgrade datasets
!pip install pandas pydub
!pip install librosa scipy soundfile

In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from pydub import AudioSegment
import wave
import os
import io

In [3]:
path = r"C:\\Users\\maron\\OneDrive\\02-Documents\\03.PROJETS\\00.INFORMATIQUE\\02.AI\\WOLOF\\SPEECH_TO_TEXT"
os.chdir(path)

# Functions

In [4]:
## the goal is to convert the speech array to WAV file in bulk
audio_file_path = 'DATA\CLEANED\waxal\\audio\\'
def ByteWAV2WAV(audio_dict, df_name):
    try:
        # Extract 'bytes' from the dictionary
        audio_bytes = audio_dict.get('audio')['bytes']

        # Save the bytes as a WAV file
        filename = f"{df_name}_{audio_dict.get('id')}_WOL.wav"
        with open(os.path.join(audio_file_path, filename), 'wb') as f:
            f.write(audio_bytes)

        # Calculate the duration of the audio
        audio = AudioSegment.from_file(os.path.join(audio_file_path, filename))
        audio_duration = len(audio) / 1000.0  # Convert from milliseconds to seconds

        # Return the generated file path, audio duration, and filename
        return audio_duration, filename
    except Exception as e:
        print(f"Error processing id {audio_dict.get('id')}: {e}")
        return None

In [5]:
def create_id(df, id_max):
    df['id'] = range(id_max + 1, id_max + len(df) + 1)
    return df

# Data loading

In [6]:
custom_cache_dir = "DATA\BRUT\DATASET\waxal\cache"

dataset_waxal = load_dataset("perrynelson/waxal-wolof")
dataset_waxal_2 = load_dataset("perrynelson/waxal-wolof2")

In [7]:
print(f'the keys of the downloaded data are : {dataset_waxal.keys()}')
print(f'the keys of the downloaded data are : {dataset_waxal_2.keys()}')

the keys of the downloaded data are : dict_keys(['train', 'validation', 'test'])
the keys of the downloaded data are : dict_keys(['test'])


In [8]:
dataset_waxal_df_train = dataset_waxal['train'].to_pandas()
dataset_waxal_df_validation = dataset_waxal['validation'].to_pandas()
dataset_waxal_df_test = dataset_waxal['test'].to_pandas()
dataset_waxal_2_df_test = dataset_waxal_2['test'].to_pandas()

# Visualization

In [9]:
dataset_waxal_df_train.head(3)

Unnamed: 0,audio,duration,transcription
0,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,33.18,"Lii de ay nit lañu, ñoo xamante ne dañoo toog,..."
1,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,18.84,Lii de ab nataal la boob mel na ne maa ngi ciy...
2,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,17.72,"Waaw, lii sunu mbokk yi nekk ci casamance lañ ..."


In [10]:
dataset_waxal_2_df_test.head(3)

Unnamed: 0,audio,duration,transcription
0,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,25.76,"Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ..."
1,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,18.78,"Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ..."
2,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,37.14,Nataal bi ab nataal la boo xamante yni maa ngi...


# Grouping all the data

In [11]:
# Assuming dataset_waxal_df_train, dataset_waxal_df_validation, dataset_waxal_df_test, dataset_waxal_2_df_test are your DataFrames
dataframes = [dataset_waxal_df_train, dataset_waxal_df_validation, dataset_waxal_df_test, dataset_waxal_2_df_test]

dataset_waxal_df = pd.concat(dataframes, axis=0, ignore_index=True)
dataset_waxal_df = create_id(dataset_waxal_df, 0)
dataset_waxal_df.head(3)

Unnamed: 0,audio,duration,transcription,id
0,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,33.18,"Lii de ay nit lañu, ñoo xamante ne dañoo toog,...",1
1,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,18.84,Lii de ab nataal la boob mel na ne maa ngi ciy...,2
2,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...,17.72,"Waaw, lii sunu mbokk yi nekk ci casamance lañ ...",3


#### saving the original data

In [12]:
dataset_waxal_df.to_csv("DATA\BRUT\DATASET\waxal\waxal_brut.csv")

### Bytes to WAV conversion

In [13]:
# create a new dataframe to store the clean data from perrynelson
dataset_waxal__clean_df = pd.DataFrame()
dataset_waxal__clean_df = dataset_waxal_df[["id", "transcription"]]
dataset_waxal__clean_df['result'] = dataset_waxal_df.apply(lambda row: ByteWAV2WAV(row, "perrynelson"), axis = 1)

# Create separate columns for filename and length
dataset_waxal__clean_df[['length', 'filename']] = pd.DataFrame(dataset_waxal__clean_df['result'].tolist(), index=dataset_waxal__clean_df.index)

# Drop the 'result' column if you no longer need it
dataset_waxal__clean_df = dataset_waxal__clean_df.drop(columns=['result'])

dataset_waxal__clean_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_waxal__clean_df['result'] = dataset_waxal_df.apply(lambda row: ByteWAV2WAV(row, "perrynelson"), axis = 1)


Unnamed: 0,id,transcription,length,filename
0,1,"Lii de ay nit lañu, ñoo xamante ne dañoo toog,...",33.178,perrynelson_1_WOL.wav
1,2,Lii de ab nataal la boob mel na ne maa ngi ciy...,18.838,perrynelson_2_WOL.wav
2,3,"Waaw, lii sunu mbokk yi nekk ci casamance lañ ...",17.714,perrynelson_3_WOL.wav


In [14]:
dataset_waxal__clean_df.to_csv("DATA\CLEANED\waxal\waxal__clean_df.csv")