# SETUP

In [1]:
%%capture
!pip install fastapi kaleido python-multipart uvicorn
!pip install --upgrade datasets
!pip install pandas pydub
!pip install librosa scipy soundfile

In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from pydub import AudioSegment
import wave
import os
import io

In [3]:
path = r"C:\\Users\\maron\\OneDrive\\02-Documents\\03.PROJETS\\00.INFORMATIQUE\\02.AI\\WOLOF"
os.chdir(path)

# Functions

In [17]:
## the goal is to convert the speech array to WAV file in bulk
audio_file_path = r'SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\google_fleurs\audio\\'
def ByteWAV2WAV(audio_dict, df_name):
    try:
        # Extract 'bytes' from the dictionary
        audio_bytes = audio_dict.get('audio')['bytes']

        # Save the bytes as a WAV file
        filename = f"{df_name}_{audio_dict.get('id')}_WOL.wav"
        with open(os.path.join(audio_file_path, filename), 'wb') as f:
            f.write(audio_bytes)

        # Calculate the duration of the audio
        audio = AudioSegment.from_file(os.path.join(audio_file_path, filename))
        audio_duration = len(audio) / 1000.0  # Convert from milliseconds to seconds

        # Return the generated file path, audio duration, and filename
        return audio_duration, filename
    except Exception as e:
        print(f"Error processing id {audio_dict.get('id')}: {e}")
        return None

In [5]:
def create_id(df, id_max):
    df['id'] = range(id_max + 1, id_max + len(df) + 1)
    return df

# Data loading

In [7]:
dataset_google_fleurs = load_dataset("google/fleurs", 'wo_sn')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading data:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/126M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/87.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
print(f'the keys of the downloaded data are : {dataset_google_fleurs.keys()}')

the keys of the downloaded data are : dict_keys(['train', 'validation', 'test'])


In [10]:
dataset_google_fleurs_train = dataset_google_fleurs['train'].to_pandas()
dataset_google_fleurs_validation = dataset_google_fleurs['validation'].to_pandas()
dataset_google_fleurs_test = dataset_google_fleurs['test'].to_pandas()

# Visualization

In [11]:
dataset_google_fleurs_train.head(3)

Unnamed: 0,id,num_samples,path,audio,transcription,raw_transcription,gender,lang_id,language,lang_group_id
0,1450,300480,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2W\x12\x00WAVEfmt \x12\x00\x00...,danoo gis ne jànt bi dafay niroog anam ci doxa...,Danoo gis ne Jànt bi dafay niroog anam ci doxa...,1,97,Wolof,3
1,315,276480,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2\xe0\x10\x00WAVEfmt \x12\x00\...,ci ay mbindam day jëfandikoo ay kàddu yinga xa...,"Ci ay mbindam, day jëfandikoo ay kàddu yinga x...",0,97,Wolof,3
2,1023,172800,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2\x8c\n\x00WAVEfmt \x12\x00\x0...,gunóor yi mën nañu yàq ñam yi waral ay yaram y...,"Gunóor yi mën nañu yàq ñam yi, waral ay yaram ...",1,97,Wolof,3


In [12]:
dataset_google_fleurs_test.head(3)

Unnamed: 0,id,num_samples,path,audio,transcription,raw_transcription,gender,lang_id,language,lang_group_id
0,1784,383040,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2a\x17\x00WAVEfmt \x12\x00\x00...,groenlaand bariwul won way-dëkk ca tariix scan...,Groenlaand bariwul won way-dëkk. Ca tariix sca...,0,97,Wolof,3
1,1768,276480,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2\xe0\x10\x00WAVEfmt \x12\x00\...,yenn pàcc yu bari ci batimaa bii dañu leen tab...,Yenn pàcc yu bari ci batimaa bii dañu leen tab...,0,97,Wolof,3
2,2008,326400,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2\xec\x13\x00WAVEfmt \x12\x00\...,yeneen tomb yu nekk ci porogaraam bi ci bali b...,Yeneen tomb yu nekk ci porogaraam bi ci Bali b...,0,97,Wolof,3


In [13]:
dataset_google_fleurs_validation.head(3)

Unnamed: 0,id,num_samples,path,audio,transcription,raw_transcription,gender,lang_id,language,lang_group_id
0,1559,152640,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2Q\t\x00WAVEfmt \x12\x00\x00\x...,làkku italiyen warul nekk lu jafe noonu ndax b...,Làkku italiyen warul nekk lu jafe noonu ndax b...,1,97,Wolof,3
1,1652,128640,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2\xda\x07\x00WAVEfmt \x12\x00\...,diine ji gëna mag ci moldavie chrétien orthodo...,Diine ji gëna mag ci Moldavie chrétien orthodo...,0,97,Wolof,3
2,1586,417600,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2}\x19\x00WAVEfmt \x12\x00\x00...,slalom moo jiitu ci moom mu amee ci yeggali wu...,"Slalom moo jiitu ci moom, mu amee ci Yeggali w...",0,97,Wolof,3


# Grouping all the data

In [14]:
# Assuming dataset_waxal_df_train, dataset_waxal_df_validation, dataset_waxal_df_test, dataset_waxal_2_df_test are your DataFrames
dataframes = [dataset_google_fleurs_train, dataset_google_fleurs_validation, dataset_google_fleurs_test]

dataset_google_fleurs = pd.concat(dataframes, axis=0, ignore_index=True)
dataset_google_fleurs = create_id(dataset_google_fleurs, 0)
dataset_google_fleurs.head(3)

Unnamed: 0,id,num_samples,path,audio,transcription,raw_transcription,gender,lang_id,language,lang_group_id
0,1,300480,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2W\x12\x00WAVEfmt \x12\x00\x00...,danoo gis ne jànt bi dafay niroog anam ci doxa...,Danoo gis ne Jànt bi dafay niroog anam ci doxa...,1,97,Wolof,3
1,2,276480,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2\xe0\x10\x00WAVEfmt \x12\x00\...,ci ay mbindam day jëfandikoo ay kàddu yinga xa...,"Ci ay mbindam, day jëfandikoo ay kàddu yinga x...",0,97,Wolof,3
2,3,172800,C:\Users\maron\.cache\huggingface\datasets\dow...,{'bytes': b'RIFF2\x8c\n\x00WAVEfmt \x12\x00\x0...,gunóor yi mën nañu yàq ñam yi waral ay yaram y...,"Gunóor yi mën nañu yàq ñam yi, waral ay yaram ...",1,97,Wolof,3


#### saving the original data

In [16]:
dataset_google_fleurs.to_csv("SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\google_fleurs\google_fleurs_brut.csv")

### Bytes to WAV conversion

In [18]:
# create a new dataframe to store the clean data from perrynelson
dataset_google_fleurs_clean = pd.DataFrame()
dataset_google_fleurs_clean = dataset_google_fleurs[["id", "transcription"]]

In [19]:
dataset_google_fleurs_clean['result'] = dataset_google_fleurs.apply(lambda row: ByteWAV2WAV(row, "google_fleurs"), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_google_fleurs_clean['result'] = dataset_google_fleurs.apply(lambda row: ByteWAV2WAV(row, "google_fleurs"), axis = 1)


In [20]:
# Create separate columns for filename and length
dataset_google_fleurs_clean[['length', 'filename']] = pd.DataFrame(dataset_google_fleurs_clean['result'].tolist(), index=dataset_google_fleurs_clean.index)
# Drop the 'result' column if you no longer need it
dataset_google_fleurs_clean = dataset_google_fleurs_clean.drop(columns=['result'])

dataset_google_fleurs_clean.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_google_fleurs_clean[['length', 'filename']] = pd.DataFrame(dataset_google_fleurs_clean['result'].tolist(), index=dataset_google_fleurs_clean.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_google_fleurs_clean[['length', 'filename']] = pd.DataFrame(dataset_google_fleurs_clean['result'].tolist(), index=dataset_google_fleurs_clean.index)


Unnamed: 0,id,transcription,length,filename
0,1,danoo gis ne jànt bi dafay niroog anam ci doxa...,18.78,google_fleurs_1_WOL.wav
1,2,ci ay mbindam day jëfandikoo ay kàddu yinga xa...,17.28,google_fleurs_2_WOL.wav
2,3,gunóor yi mën nañu yàq ñam yi waral ay yaram y...,10.8,google_fleurs_3_WOL.wav


In [23]:
print("Amount of data for Alffa :", sum(dataset_google_fleurs_clean['length']))

Amount of data for Alffa : 40456.97999999993


: 

In [22]:
dataset_google_fleurs_clean.to_csv("SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\google_fleurs\google_fleurs_clean.csv")