# SETUP

In [1]:
%%capture
!pip install fastapi kaleido python-multipart uvicorn
!pip install --upgrade datasets
!pip install pandas pydub
!pip install librosa scipy soundfile

In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from pydub import AudioSegment
import wave
import os
import io

In [3]:
path = r"C:\\Users\\maron\\OneDrive\\02-Documents\\03.PROJETS\\00.INFORMATIQUE\\02.AI\\WOLOF\\SPEECH_TO_TEXT"
os.chdir(path)

# Functions

In [5]:
## the goal is to convert the speech array to WAV file in bulk
audio_file_path = 'DATA\CLEANED\\alffa\\audio\\'
def array2WAV(row, df_name, sample_rate = 16000):
    # Convert NumPy array to AudioSegment object
    samples = row.get('audio')
    samples = (samples * (2**15 - 1)).astype(np.int16)
    audio_segment = AudioSegment(samples.tobytes(), sample_width = 2, frame_rate = sample_rate, channels = 1)

    # Save the AudioSegment as a WAV file
    filename = f"{df_name}_{row.get('id')}_WOL.wav"
    audio_segment.export(os.path.join(audio_file_path, filename), format="wav")

    # Calculate the duration of the audio
    audio = AudioSegment.from_file(os.path.join(audio_file_path, filename))
    audio_duration = len(audio) / 1000.0  # Convert from milliseconds to seconds

    # Return the generated file path, audio duration, and filename
    return audio_duration, filename

In [6]:
def create_id(df, id_max):
    df['id'] = range(id_max + 1, id_max + len(df) + 1)
    return df

# Data loading

In [7]:
custom_cache_dir = "DATA\BRUT\DATASET\\alffa\cache"
dataset_alffa = load_dataset("Isma/alffa_wolof")

In [8]:
print(f'the keys of the downloaded data are : {dataset_alffa.keys()}')

the keys of the downloaded data are : dict_keys(['train', 'dev'])


In [9]:
dataset_alffa_df_train = dataset_alffa['train'].to_pandas()
dataset_alffa_df_dev = dataset_alffa['dev'].to_pandas()

In [10]:
dataset_alffa_df_train.head(3)

Unnamed: 0,audio,speaker_id,transcription,filename
0,"[0.0, 0.00030517578, -0.0009765625, 0.03137207...",12,jén fa nga ko jàppe soo ko fa sange mu rëcc,WOL_12_lect_0505
1,"[3.0517578e-05, -0.00012207031, 0.00033569336,...",12,ngorsi kat masu maa jaar buroom di tataani,WOL_12_lect_0528
2,"[9.1552734e-05, -0.00018310547, 0.00036621094,...",12,dañu ko logal moo tax déggatuloo ko,WOL_12_lect_0748


In [11]:
dataset_alffa_df_dev.head(3)

Unnamed: 0,audio,speaker_id,transcription,filename
0,"[-0.003692627, -0.005340576, -0.0039978027, -0...",3,bi waalo sonnee ci moom nag ñu fas yéené rey ko,WOL_03_lect_0155
1,"[-0.008880615, -0.017181396, -0.015716553, -0....",3,caabi ji dafa réer moo tax mu dàjji bunt bi,WOL_03_lect_0641
2,"[0.008026123, 0.015472412, 0.009185791, 0.0153...",3,maa ngiy jékkiji ba ëllëg ci sarax si,WOL_03_lect_0952


### Grouping all the data

In [13]:
# Assuming dataset_waxal_df_train, dataset_waxal_df_validation, dataset_waxal_df_test, dataset_waxal_2_df_test are your DataFrames
dataframes = [dataset_alffa_df_train, dataset_alffa_df_dev]

dataset_alffa_df = pd.concat(dataframes, axis=0, ignore_index=True)
dataset_alffa_df = create_id(dataset_alffa_df, 0)
dataset_alffa_df.head(3)

Unnamed: 0,audio,speaker_id,transcription,filename,id
0,"[0.0, 0.00030517578, -0.0009765625, 0.03137207...",12,jén fa nga ko jàppe soo ko fa sange mu rëcc,WOL_12_lect_0505,1
1,"[3.0517578e-05, -0.00012207031, 0.00033569336,...",12,ngorsi kat masu maa jaar buroom di tataani,WOL_12_lect_0528,2
2,"[9.1552734e-05, -0.00018310547, 0.00036621094,...",12,dañu ko logal moo tax déggatuloo ko,WOL_12_lect_0748,3


In [14]:
dataset_alffa_df.to_csv("DATA\BRUT\DATASET\\alffa\\alffa_brut_df.csv")

### Sequence to WAV conversion

In [15]:
# create a new dataframe to store the clean data from perrynelson
dataset_alffa_clean_df = pd.DataFrame()
dataset_alffa_clean_df = dataset_alffa_df[["id", "transcription"]]
dataset_alffa_clean_df['result'] = dataset_alffa_df.apply(lambda row: array2WAV(row, "isma"), axis = 1)

# Create separate columns for filename and length
dataset_alffa_clean_df[['length', 'filename']] = pd.DataFrame(dataset_alffa_clean_df['result'].tolist(), index=dataset_alffa_clean_df.index)

# Drop the 'result' column if you no longer need it
dataset_alffa_clean_df = dataset_alffa_clean_df.drop(columns=['result'])

dataset_alffa_clean_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_alffa_clean_df['result'] = dataset_alffa_df.apply(lambda row: array2WAV(row, "isma"), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_alffa_clean_df[['length', 'filename']] = pd.DataFrame(dataset_alffa_clean_df['result'].tolist(), index=dataset_alffa_clean_df.index)


Unnamed: 0,id,transcription,length,filename
0,1,jén fa nga ko jàppe soo ko fa sange mu rëcc,5.968,isma_1_WOL.wav
1,2,ngorsi kat masu maa jaar buroom di tataani,3.946,isma_2_WOL.wav
2,3,dañu ko logal moo tax déggatuloo ko,5.608,isma_3_WOL.wav


In [16]:
dataset_alffa_clean_df.to_csv("DATA\CLEANED\\alffa\\alffa_clean_df.csv")