# Setup

In [1]:
import os
import re

import pandas as pd
import regex
from sklearn.model_selection import train_test_split

In [2]:
# Every relative path in this notebook is resolved against the project root.
# The root defaults to the original workstation location but can be overridden
# with the WOLOF_PROJECT_ROOT environment variable, so the notebook can run on
# another machine without editing the code.
PROJECT_ROOT = os.environ.get("WOLOF_PROJECT_ROOT", r"E:\IA\WOLOF")
os.chdir(PROJECT_ROOT)

# Functions

In [3]:
def store_in_pickles(df, directory, name):
    """Write every row of ``df`` to its own pickle file inside ``directory``.

    The directory (including missing parents) is created when absent. Row
    number ``i`` (positional, starting at 0) is saved as a one-row DataFrame
    in ``<name>_data_<i>.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are exported one by one.
    directory : str
        Destination folder.
    name : str
        Prefix used for every generated file name.
    """
    os.makedirs(directory, exist_ok=True)

    row_count = len(df)
    for position in range(row_count):
        file_name = name + f'_data_{position}.pkl'
        # Double brackets keep the row as a DataFrame rather than a Series.
        single_row = df.iloc[[position]]
        single_row.to_pickle(os.path.join(directory, file_name))

In [4]:
# Characters that pass the keep-filter below but are nevertheless deleted in a
# second pass (a few stray letters, the apostrophe, NBSP and line breaks).
chars_to_ignore_regex = r'[кɲớˈ\'\xa0\r\n]'
# Negated class: anything that is NOT listed here is deleted from the text.
chars_to_keep_regex = r'[^a-zA-Z\sёñïóŋöäàéîā́сđớ\'ˈоɗɲtx їüúaëçèĩã̈ûjämсукéеɓìs️öŋïõăаrýànóvñlò̃qẽyfƭhgziâwíńồpêáôùёībkр]'
# Literal substitutions applied one after another, in this order, once the
# text has been filtered.
replace_dict = {'ï': 'a', 'î': 'i', 'ā': 'a', 'ƭ': 'c', 'ī': 'i', 'ä': 'a', 'ɗ': 'nd', 'ń': 'ñ', 'ồ': 'o',
                'ї': 'i', 'ü': 'u', 'ù': 'u', 'ú': 'u', 'ă': 'ã', '̃': '', 'â': 'a', '́': '', 'û': 'u',
                '̈': '', 'è': 'e', 'ç': 's', 'ö': 'o', 'ý': 'y', 'ì': 'i', 'í': 'i', '̀': '', 'ɓ':'b', 'ô':'o',
                'ê':'e', 'à':'a'}
def remove_special_characters(sentence):
    """Normalise one transcription string.

    Steps, in order:

    1. Split on commas, trim each piece, drop empty pieces and re-join them, so
       that ``"a,b"`` and ``"a , b"`` both become ``"a b"`` once the comma is
       filtered out.
    2. Lowercase the text. This happens *before* filtering because the keep
       list only contains lowercase forms of the non-ASCII letters; filtering
       first would silently delete characters such as ``Ñ``.
    3. Delete every character not listed in ``chars_to_keep_regex``, then every
       character listed in ``chars_to_ignore_regex``.
    4. Apply the ``replace_dict`` substitutions sequentially.

    Parameters
    ----------
    sentence : str
        Raw transcription.

    Returns
    -------
    str
        Cleaned, lowercased text with no leading whitespace and exactly one
        trailing space.
    """
    pieces = (piece.strip() for piece in sentence.split(','))
    text = ", ".join(piece for piece in pieces if piece)

    text = text.lower()
    text = re.sub(chars_to_keep_regex, '', text)
    text = re.sub(chars_to_ignore_regex, '', text)

    # Keys are literal characters, so plain string replacement is sufficient.
    for old, new in replace_dict.items():
        text = text.replace(old, new)

    # Deleting characters can leave stray whitespace at either end; trim it and
    # add a single trailing space (instead of one per processing step).
    return text.strip() + " "

# Loading Data

In [5]:
# Load `alffa_cleaned_tot.pkl` (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
alffa_cleaned = pd.read_pickle(
    r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zz_total_cleaned\alffa_cleaned_tot.pkl"
)

In [6]:
# Load `alffa_git_clean_tot.pkl` (path is relative to the working directory).
alffa_git_clean = pd.read_pickle(
    r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zz_total_cleaned\alffa_git_clean_tot.pkl"
)

In [7]:
# Load `google_fleurs_clean_tot.pkl` (path is relative to the working directory).
google_fleurs_clean = pd.read_pickle(
    r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zz_total_cleaned\google_fleurs_clean_tot.pkl"
)

In [8]:
# Load `waxal_clean_tot.pkl` (path is relative to the working directory).
waxal_clean = pd.read_pickle(
    r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zz_total_cleaned\waxal_clean_tot.pkl"
)

In [9]:
# Load `waxal_git_clean_tot.pkl` (path is relative to the working directory).
waxal_git_clean = pd.read_pickle(
    r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zz_total_cleaned\waxal_git_clean_tot.pkl"
)

In [10]:
# Load `zenodo_tot.pkl` (path is relative to the working directory).
zenodo_clean = pd.read_pickle(
    r"SPEECH_TO_TEXT\DATA\CLEANED\WOLOF_AUDIO_TRANS\zz_total_cleaned\zenodo_tot.pkl"
)

In [11]:
# Stack the six source frames row-wise, in the order listed. `pd.concat` is
# called without `ignore_index`, so each frame keeps its own index labels.
dataframes = [
    zenodo_clean,
    waxal_git_clean,
    waxal_clean,
    google_fleurs_clean,
    alffa_git_clean,
    alffa_cleaned,
]
df_clean_tot = pd.concat(dataframes)
df_clean_tot.head(3)

Unnamed: 0,transcription,filename,path,length,audio,id
0,"Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yeek...",4af26658-5f2a-401c-aa35-748717079afa/02d98c90f...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\4a...,6.384,"[-3.6948222e-13, 4.405365e-13, -1.7053026e-13,...",
1,"Tey texeg ndaw ñi,ak mag ñi,ak waxambaane yeek...",fbc2f100-357a-4840-b1df-8b562861e8dd/02d98c90f...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\fb...,5.616,"[4.3655746e-11, 4.3655746e-11, -1.4551915e-11,...",
2,Jamonoy cëtëŋ j,4af26658-5f2a-401c-aa35-748717079afa/038677fc2...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\4a...,3.744,"[-7.275958e-12, -1.8189894e-12, -1.4551915e-11...",


In [12]:
# Upper bound (inclusive) on the `length` column; rows above it are discarded.
# Presumably the unit is seconds — TODO confirm against the data source.
MAX_AUDIO_LENGTH = 6

# .copy() detaches the filtered rows from the original frame so the column
# assignment done in the preprocessing step does not raise
# SettingWithCopyWarning.
df_clean_tot = df_clean_tot.loc[df_clean_tot['length'] <= MAX_AUDIO_LENGTH].copy()

# Preprocessing

In [13]:
# Clean every transcription with `remove_special_characters`.
# `.assign` returns a new frame instead of writing into a frame that was
# produced by boolean filtering, which avoids SettingWithCopyWarning.
df_clean_tot = df_clean_tot.assign(
    transcription=df_clean_tot["transcription"].apply(remove_special_characters)
)
df_clean_tot.head(3)

Unnamed: 0,transcription,filename,path,length,audio,id
1,tey texeg ndaw ñi ak mag ñi ak waxambaane yeek...,fbc2f100-357a-4840-b1df-8b562861e8dd/02d98c90f...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\fb...,5.616,"[4.3655746e-11, 4.3655746e-11, -1.4551915e-11,...",
2,jamonoy cëtëŋ j,4af26658-5f2a-401c-aa35-748717079afa/038677fc2...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\4a...,3.744,"[-7.275958e-12, -1.8189894e-12, -1.4551915e-11...",
3,jamonoy cëtëŋ j,fbc2f100-357a-4840-b1df-8b562861e8dd/038677fc2...,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\fb...,2.376,"[-7.2759576e-11, 0.0, -5.820766e-10, 1.1641532...",


In [14]:
# Discard the metadata columns that are not needed past this point, then show
# the first five rows transposed so the wide cells are easier to read.
df_clean_tot = df_clean_tot.drop(
    labels=['filename', 'path', 'length', 'id'],
    axis="columns",
)
df_clean_tot.head(5).T

Unnamed: 0,1,2,3,4,5
transcription,tey texeg ndaw ñi ak mag ñi ak waxambaane yeek...,jamonoy cëtëŋ j,jamonoy cëtëŋ j,da ngeen a xëm,da ngeen a xëm
audio,"[4.3655746e-11, 4.3655746e-11, -1.4551915e-11,...","[-7.275958e-12, -1.8189894e-12, -1.4551915e-11...","[-7.2759576e-11, 0.0, -5.820766e-10, 1.1641532...","[-1.8189894e-12, 4.5474735e-12, -3.637979e-12,...","[2.6193447e-10, -2.910383e-11, 1.4551915e-10, ..."


# Export transcriptions, split the data, and store each row in a separate pickle file

In [15]:
# Export the cleaned transcriptions (index plus the `transcription` column) to CSV.
transcription_dir = r"SPEECH_TO_TEXT\DATA\PREPROCESSED\total_data"
# `to_csv` does not create missing folders, so make sure the target exists
# first (same approach as `store_in_pickles`).
os.makedirs(transcription_dir, exist_ok=True)
df_clean_tot['transcription'].to_csv(
    os.path.join(transcription_dir, "data_transcription_tot.csv")
)

In [16]:
# `train_test_split` is imported in the notebook's import cell.

# Share of rows used for training. Not passed to `train_test_split`: the
# training share is whatever remains after `validation_size` is held out.
train_size = 0.85
# Share of rows held out of training. Despite the name, this hold-out is
# subsequently divided into validation AND test.
validation_size = 0.15
# Share of the held-out rows that becomes the test set; the rest is validation.
# Overall: ~85% train, ~14.925% validation, ~0.075% test.
test_share_of_holdout = 0.005
# Fixed seed so both splits are reproducible.
split_seed = 0

# Randomly split into training and validation+test
df_clean_tot_train, df_clean_tot_validation_test = train_test_split(
    df_clean_tot, test_size=validation_size, random_state=split_seed
)

# Further split validation+test into validation and test
df_clean_tot_validation, df_clean_tot_test = train_test_split(
    df_clean_tot_validation_test, test_size=test_share_of_holdout, random_state=split_seed
)

In [17]:
# Write each training row to its own `train_data_<i>.pkl` file.
direct_train = r"SPEECH_TO_TEXT\DATA\PREPROCESSED\train"
store_in_pickles(df=df_clean_tot_train, directory=direct_train, name="train")

In [18]:
# Write each validation row to its own `validation_data_<i>.pkl` file.
direct_validation = r"SPEECH_TO_TEXT\DATA\PREPROCESSED\validation"
store_in_pickles(df=df_clean_tot_validation, directory=direct_validation, name="validation")

In [19]:
# Write each test row to its own `test_data_<i>.pkl` file.
direct_test = r"SPEECH_TO_TEXT\DATA\PREPROCESSED\test"
store_in_pickles(df=df_clean_tot_test, directory=direct_test, name="test")