# SETUP

In [1]:
%%capture
!pip install fastapi kaleido python-multipart uvicorn
!pip install --upgrade datasets
!pip install pandas pydub
!pip install librosa scipy soundfile

In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from pydub import AudioSegment
import wave
import os
import io
import librosa
from scipy.signal import resample as scipy_resample

In [3]:
# Project root: the relative paths used later in this notebook are resolved
# against it. Set the WOLOF_PROJECT_DIR environment variable to run the notebook
# from another machine; when it is unset the original local directory is used.
path = os.environ.get(
    "WOLOF_PROJECT_DIR",
    r"C:\Users\maron\OneDrive\02-Documents\03.PROJETS\00.INFORMATIQUE\02.AI\WOLOF",
)
os.chdir(path)

# Functions

In [4]:
def resample_audio(input_path, path_audio, target_sr=16000):
    """Return the duration of one audio file.

    Despite its name, this function does not resample anything and writes
    nothing to disk: it only decodes the file and measures how long it is.

    Parameters
    ----------
    input_path : str
        File name (or relative path) appended to ``path_audio``.
    path_audio : str
        Prefix placed directly in front of ``input_path`` by plain string
        concatenation, so it must already end with a path separator.
    target_sr : int, optional
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    The value reported by ``librosa.get_duration`` for the decoded signal.
    """
    full_path = path_audio + input_path

    # sr=None asks librosa to keep the file's own sampling rate.
    samples, native_rate = librosa.load(full_path, sr=None)

    return librosa.get_duration(y=samples, sr=native_rate)

# Data loading

In [5]:
# Raw Waxal transcription table; the path is relative to the current working directory.
transcriptions_csv = r"SPEECH_TO_TEXT\DATA\BRUT\WOLOF_AUDIO_TRANS\waxal_git\transcription_+\wolof\transcriptions.csv"
df_waxal_git = pd.read_csv(transcriptions_csv)

# Preview the first rows.
df_waxal_git.head(10)

Unnamed: 0,Key,Transcriber,Target Language,Text,Status,Response,Created At
0,044a40d3,diallo.papali419@gmail.com,wolof,Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ...,Approved,549: 1,
1,86a8fd9f,diallo.papali419@gmail.com,wolof,"Nataal bi, ñu ngi bind ci kowam ""Casamance"". A...",Approved,2602: 1,
2,e443accc,diallo.papali419@gmail.com,wolof,"Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ...",Approved,3030: 1,
3,48a14b96,diallo.papali419@gmail.com,wolof,"Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ...",Approved,2996: 1,
4,79a94c54,diallo.papali419@gmail.com,wolof,"Lii de ay nit lañu, ñoo xamante ne dañoo toog,...",Corrected,1884: 1,
5,,,,,,,
6,a13b57f9,mbayesow1998lg@gmail.com,wolof,Nataal bii maa ngi ciy janloog haa ay nit yu b...,Approved,3684: 1,
7,047cbbce,mbayesow1998lg@gmail.com,wolof,Waaw nataal bii nataal la boob ay nit ñu baree...,Approved,3643: 1,
8,239c07fa,diallo.papali419@gmail.com,wolof,"Bismillah ! Nataal bii mi ngi wane am mbooloo,...",Approved,2494: 1,
9,d7b57bff,diallo.papali419@gmail.com,wolof,"Waaw, nataal bii ñu ngi ci gis ay nit ñu yëkka...",Approved,3375: 1,


In [6]:
# Drop the listed metadata columns, then discard every row that still has a
# missing value.
unused_columns = ['Key', 'Transcriber', 'Target Language', 'Status', 'Created At']
df_waxal_git = (
    df_waxal_git
    .drop(columns=unused_columns)
    .dropna(axis=0)
)
df_waxal_git.head(10)

Unnamed: 0,Text,Response
0,Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ...,549: 1
1,"Nataal bi, ñu ngi bind ci kowam ""Casamance"". A...",2602: 1
2,"Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ...",3030: 1
3,"Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ...",2996: 1
4,"Lii de ay nit lañu, ñoo xamante ne dañoo toog,...",1884: 1
6,Nataal bii maa ngi ciy janloog haa ay nit yu b...,3684: 1
7,Waaw nataal bii nataal la boob ay nit ñu baree...,3643: 1
8,"Bismillah ! Nataal bii mi ngi wane am mbooloo,...",2494: 1
9,"Waaw, nataal bii ñu ngi ci gis ay nit ñu yëkka...",3375: 1
10,foto bii nag dafa mel ni benn ñaxtukat moo yék...,19: 2


In [7]:
def extract_filename(row):
    r"""Build the relative .ogg path encoded in a ``Response`` value.

    A value such as ``"549: 1"`` holds two colon-separated fields; the first
    becomes the file stem and the second the sub-folder, giving
    ``wolof\1\549.ogg``.

    Parameters
    ----------
    row : str
        The ``Response`` cell, formatted as ``"<file stem>: <folder>"``.
        Whitespace around either field is ignored.

    Returns
    -------
    str
        Path of the form ``wolof\<folder>\<file stem>.ogg``.

    Raises
    ------
    ValueError
        If the value has no colon separator or one of the two fields is empty.
    """
    # Split once and strip, so stray spaces never leak into the path.
    fields = [field.strip() for field in row.split(":")]
    if len(fields) < 2 or not fields[0] or not fields[1]:
        raise ValueError(
            f"Malformed Response value {row!r}; expected '<file stem>: <folder>'"
        )
    stem, folder = fields[0], fields[1]
    return "wolof\\" + folder + "\\" + stem + ".ogg"

In [8]:
# Derive each clip's relative audio path from its Response value, then drop
# Response.
df_waxal_git = (
    df_waxal_git
    .assign(filename=df_waxal_git['Response'].apply(extract_filename))
    .drop(columns=['Response'])
)
df_waxal_git.head(4)

Unnamed: 0,Text,filename
0,Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ...,wolof\1\549.ogg
1,"Nataal bi, ñu ngi bind ci kowam ""Casamance"". A...",wolof\1\2602.ogg
2,"Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ...",wolof\1\3030.ogg
3,"Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ...",wolof\1\2996.ogg


# Final cleaned data

In [11]:
# Folder holding the audio clips; the trailing separator is required because
# resample_audio joins the prefix and the file name by plain concatenation.
path_audio = r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\waxal_git\audio\\"

# Measure every clip (this decodes each file, so it can be slow).
clip_lengths = df_waxal_git['filename'].apply(lambda name: resample_audio(name, path_audio))

# Append length and transcription as new trailing columns, then remove Text,
# which leaves the column order filename, length, transcription.
df_waxal_git = (
    df_waxal_git
    .assign(length=clip_lengths, transcription=df_waxal_git['Text'])
    .drop(columns=['Text'])
)
df_waxal_git.head(3)

Unnamed: 0,filename,length,transcription
0,wolof\1\549.ogg,22.617812,Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ...
1,wolof\1\2602.ogg,26.177813,"Nataal bi, ñu ngi bind ci kowam ""Casamance"". A..."
2,wolof\1\3030.ogg,18.7735,"Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ..."


In [12]:
# Total of the per-clip lengths. The builtin sum is kept deliberately:
# it adds the values in row order, so the printed figure stays the same.
total_length = sum(df_waxal_git['length'])
print(total_length)

25643.60250000002


In [13]:
# Save the cleaned table (filename, length, transcription); the DataFrame index
# is written as the first, unnamed column.
clean_csv_path = r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\waxal_git\waxal_git_clean.csv"
df_waxal_git.to_csv(clean_csv_path)