# SETUP

In [2]:
%%capture
!pip install fastapi kaleido python-multipart uvicorn
!pip install --upgrade datasets
!pip install pandas pydub
!pip install librosa scipy soundfile

In [4]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from pydub import AudioSegment
import wave
import os
import io

In [5]:
# Project root: every relative "DATA\..." path used below is resolved against it.
# Set the WOLOF_STT_PROJECT_DIR environment variable to run the notebook from
# another machine or folder without editing this cell; when it is not set, the
# original author's folder is used.
# The fallback is a raw string, so its backslashes are taken literally and
# must not be doubled.
path = os.environ.get(
    "WOLOF_STT_PROJECT_DIR",
    r"C:\Users\maron\OneDrive\02-Documents\03.PROJETS\00.INFORMATIQUE\02.AI\WOLOF\SPEECH_TO_TEXT",
)
os.chdir(path)

# Functions

In [22]:
## the goal is to convert the speech array to WAV file in bulk
# Folder (relative to the working directory) that receives the generated WAV files.
# Every backslash is escaped explicitly: sequences such as "\C" or "\s" are not
# valid escapes and trigger a SyntaxWarning on recent Python versions. A raw
# string cannot be used here because the value ends with a backslash.
audio_file_path = 'DATA\\CLEANED\\serge\\audio\\'
def ByteWAV2WAV(audio_dict, df_name):
    """Write one record's embedded audio bytes to a WAV file and measure it.

    The file is written into the module-level ``audio_file_path`` folder
    (created on demand) under the name ``<df_name>_<id>_WOL.wav``.

    Args:
        audio_dict: Record exposing ``get`` (a dict or a DataFrame row).
            ``audio_dict.get('audio')['bytes']`` supplies the raw file bytes
            and ``audio_dict.get('id')`` the identifier used in the file name.
        df_name: Prefix placed at the start of the generated file name.

    Returns:
        A ``(audio_duration, filename)`` tuple, where ``audio_duration`` is in
        seconds and ``filename`` is the file name without its folder, or
        ``None`` when any step fails (the error is printed, not raised).
    """
    try:
        # Extract 'bytes' from the dictionary
        audio_bytes = audio_dict.get('audio')['bytes']

        # Make sure the destination folder exists; otherwise every write
        # would fail and be reported as a per-row processing error.
        os.makedirs(audio_file_path, exist_ok=True)

        # Save the bytes as a WAV file
        filename = f"{df_name}_{audio_dict.get('id')}_WOL.wav"
        wav_path = os.path.join(audio_file_path, filename)
        with open(wav_path, 'wb') as f:
            f.write(audio_bytes)

        # Calculate the duration of the audio from the file just written
        audio = AudioSegment.from_file(wav_path)
        audio_duration = len(audio) / 1000.0  # Convert from milliseconds to seconds

        # Return the audio duration and the file name (not the full path)
        return audio_duration, filename
    except Exception as e:
        # Best effort: report the failing record and let the caller carry on.
        print(f"Error processing id {audio_dict.get('id')}: {e}")
        return None

In [12]:
def create_id(df, id_max):
    """Add an ``id`` column of consecutive integers to ``df``.

    The first row receives ``id_max + 1`` and each following row the next
    integer. ``df`` is modified in place and the same object is returned.
    """
    first_id = id_max + 1
    df['id'] = range(first_id, first_id + len(df))
    return df

# Data loading

In [14]:
# Folder (relative to the working directory) used as the download cache.
# Raw string: sequences such as "\B", "\D" or "\c" are not valid escapes.
custom_cache_dir = r"DATA\BRUT\DATASET\serge\cache"

# Load the dataset, passing the custom cache directory so that the download is
# stored there rather than in the library's default cache location.
dataset_serge = load_dataset(
    "serge-wilson/wolof_speech_transcription",
    cache_dir=custom_cache_dir,
)

In [15]:
# Show which splits the downloaded dataset exposes.
print(f'the keys of the downloaded data are : {dataset_serge.keys()}')

the keys of the downloaded data are : dict_keys(['train', 'test'])


In [16]:
# Convert each split of the downloaded dataset into its own pandas DataFrame.
dataset_serge_df_train, dataset_serge_df_test = (
    dataset_serge[split_name].to_pandas() for split_name in ('train', 'test')
)

In [17]:
# Preview the first three rows of the training split.
dataset_serge_df_train.head(n=3)

Unnamed: 0,audio,sentence
0,{'bytes': b'RIFF\xe4B\x02\x00WAVEfmt \x10\x00\...,moo taxit feebaru jigéen dafa bari façons
1,{'bytes': b'RIFF\xa4\x19\x02\x00WAVEfmt \x10\x...,bu nawetee du génn te gànnaayoowul ak permiyaabal
2,{'bytes': b'RIFF\x84\x81\x01\x00WAVEfmt \x10\x...,kii de dafay wanteer njëg li neex na lool


In [18]:
# Preview the first three rows of the test split.
dataset_serge_df_test.head(n=3)

Unnamed: 0,audio,sentence
0,{'bytes': b'RIFFd]\x02\x00WAVEfmt \x10\x00\x00...,foofa fépp ñaay nañu ko ba samp fa suñu raaya
1,{'bytes': b'RIFF\xe4\x8a\x02\x00WAVEfmt \x10\x...,xàmb yi lañuy waaja toxal ci seen sanc bu bees bi
2,{'bytes': b'RIFFd\x9b\x01\x00WAVEfmt \x10\x00\...,demal ne sa tànta mu abal ma gurmetam ba


# Grouping all the data

In [19]:
# Stack the train and test splits into one frame with a fresh 0..n-1 index,
# then number the rows with consecutive ids starting at 1.
dataframes = [dataset_serge_df_train, dataset_serge_df_test]

dataset_serge_df = create_id(
    pd.concat(dataframes, axis=0, ignore_index=True),
    0,
)
dataset_serge_df.head(n=3)

Unnamed: 0,audio,sentence,id
0,{'bytes': b'RIFF\xe4B\x02\x00WAVEfmt \x10\x00\...,moo taxit feebaru jigéen dafa bari façons,1
1,{'bytes': b'RIFF\xa4\x19\x02\x00WAVEfmt \x10\x...,bu nawetee du génn te gànnaayoowul ak permiyaabal,2
2,{'bytes': b'RIFF\x84\x81\x01\x00WAVEfmt \x10\x...,kii de dafay wanteer njëg li neex na lool,3


In [20]:
# Save the combined raw frame. The path is a raw string because sequences such
# as "\B", "\D" and "\s" are not valid escapes and trigger a SyntaxWarning on
# recent Python versions; the resulting path is unchanged.
dataset_serge_df.to_csv(r"DATA\BRUT\DATASET\serge\serge_brut_df.csv")

# Bytes to WAV conversion

In [23]:
# Build the cleaned serge dataframe: one row per clip holding its id, its
# transcription, the clip duration and the name of the WAV file written to disk.
dataset_serge_clean_df = pd.DataFrame()
dataset_serge_clean_df['id'] = dataset_serge_df["id"].copy()
dataset_serge_clean_df["transcription"] = dataset_serge_df["sentence"].copy()

# Write every row's audio bytes to a WAV file. ByteWAV2WAV returns
# (duration_in_seconds, filename) on success and None when a row fails.
conversion_results = dataset_serge_df.apply(lambda row: ByteWAV2WAV(row, "serge"), axis=1)

# A failed row would otherwise break the two-column expansion below, so it is
# replaced by an empty pair and shows up as missing length/filename values.
length_filename_pairs = [
    result if result is not None else (None, None)
    for result in conversion_results
]

# Create separate columns for length and filename
dataset_serge_clean_df[['length', 'filename']] = pd.DataFrame(
    length_filename_pairs,
    index=dataset_serge_clean_df.index,
    columns=['length', 'filename'],
)

dataset_serge_clean_df.head(3)

Unnamed: 0,id,transcription,length,filename
0,1,moo taxit feebaru jigéen dafa bari façons,4.63,serge_1_WOL.wav
1,2,bu nawetee du génn te gànnaayoowul ak permiyaabal,4.3,serge_2_WOL.wav
2,3,kii de dafay wanteer njëg li neex na lool,3.083,serge_3_WOL.wav


In [24]:
# Save the cleaned frame. The path is a raw string because sequences such as
# "\C" and "\s" are not valid escapes and trigger a SyntaxWarning on recent
# Python versions; the resulting path is unchanged.
dataset_serge_clean_df.to_csv(r"DATA\CLEANED\serge\serge_clean_df.csv")