<a href="https://colab.research.google.com/github/Olusola84/OlusolaDocs/blob/main/Preprocessing001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Step 1: Load the Moses-format files (English and Yoruba)
en_file_path = '/content/drive/MyDrive/PROJECT RESEARCH /en-yo.txt-files/wikimedia.en-yo.en'  # English sentences
yo_file_path = '/content/drive/MyDrive/PROJECT RESEARCH /en-yo.txt-files/wikimedia.en-yo.yo'   # Yoruba sentences


In [2]:
# Open and read the files line by line
with open(en_file_path, 'r', encoding='utf-8') as en_file:
    en_sentences = en_file.readlines()

In [4]:
with open(yo_file_path, 'r', encoding='utf-8') as yo_file:
    yo_sentences = yo_file.readlines()

In [5]:
# Step 2: Check that both files have the same number of sentences
assert len(en_sentences) == len(yo_sentences), "The number of lines in both files doesn't match!"

In [7]:
print("Number of sentences in English:", len(en_sentences))
print("Number of sentences in Yoruba:", len(yo_sentences))

Number of sentences in English: 12478
Number of sentences in Yoruba: 12478


In [6]:
# Step 3: Create a DataFrame with the aligned sentences
df = pd.DataFrame({'english': en_sentences, 'yoruba': yo_sentences})

# Step 4: Format Conversion (remove newlines and strip whitespace)
df['english'] = df['english'].str.strip()
df['yoruba'] = df['yoruba'].str.strip()

print("Data after loading and stripping whitespace:")
print(df.head())

Data after loading and stripping whitespace:
                                             english  \
0  Henrietta Ogan is a Nigerian Business administ...   
1  She was succeeded by Ahmed Tijani Mora, a Phar...   
2                                         References   
3  Kola Oyewo (born March 26, 1946) is a Nigerian...   
4                                         Early life   

                                              yoruba  
0  Henrietta Ogan jẹ́ alámójútó okùn òwò ...  
1  Ó fi ipò yìí sílẹ̀ fún Ahmed Tijani Mor...  
2                                 Àwọn ìtọ́kasí  
3  '''Kola Oyewo''' (bìi 26 Oṣù kẹta 1946) jẹ́ òṣ...  
4                                           Ìgbà èwe  


In [8]:
# Now, you can save this DataFrame to a CSV file or continue with further preprocessing.
df.to_csv('aligned_parallel_data.csv', index=False)

# Step 5: Text Normalization (convert to lowercase, remove punctuation)
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.strip()  # Remove leading/trailing whitespace
    return text

df['english'] = df['english'].apply(normalize_text)
df['yoruba'] = df['yoruba'].apply(normalize_text)

print("\nNormalized Data Sample:")
print(df.head())




Normalized Data Sample:
                                             english  \
0  henrietta ogan is a nigerian business administ...   
1  she was succeeded by ahmed tijani mora a pharm...   
2                                         references   
3  kola oyewo born march 26 1946 is a nigerian ve...   
4                                         early life   

                                              yoruba  
0  henrietta ogan je alamojuto okun owo ti o si j...  
1  o fi ipo yii sile fun ahmed tijani mora ogbont...  
2                                       awon itokasi  
3  kola oyewo bìi 26 oṣù kẹta 1946 jẹ òṣèré àti o...  
4                                           ìgbà èwe  


In [9]:
# Step 6: Tokenization (simple word-based tokenization)
df['english_tokens'] = df['english'].apply(lambda x: x.split())
df['yoruba_tokens'] = df['yoruba'].apply(lambda x: x.split())

print("\nTokenized Data Sample:")
print(df[['english_tokens', 'yoruba_tokens']].head())



Tokenized Data Sample:
                                      english_tokens  \
0  [henrietta, ogan, is, a, nigerian, business, a...   
1  [she, was, succeeded, by, ahmed, tijani, mora,...   
2                                       [references]   
3  [kola, oyewo, born, march, 26, 1946, is, a, ni...   
4                                      [early, life]   

                                       yoruba_tokens  
0  [henrietta, ogan, je, alamojuto, okun, owo, ti...  
1  [o, fi, ipo, yii, sile, fun, ahmed, tijani, mo...  
2                                    [awon, itokasi]  
3  [kola, oyewo, bìi, 26, oṣù, kẹta, 1946, jẹ, òṣ...  
4                                        [ìgbà, èwe]  


In [10]:
# Step 7: Cleaning (remove empty or incomplete sentence pairs)
df_clean = df[(df['english'].str.len() > 0) & (df['yoruba'].str.len() > 0)]

print("\nCleaned Data Sample:")
print(df_clean.head())


Cleaned Data Sample:
                                             english  \
0  henrietta ogan is a nigerian business administ...   
1  she was succeeded by ahmed tijani mora a pharm...   
2                                         references   
3  kola oyewo born march 26 1946 is a nigerian ve...   
4                                         early life   

                                              yoruba  \
0  henrietta ogan je alamojuto okun owo ti o si j...   
1  o fi ipo yii sile fun ahmed tijani mora ogbont...   
2                                       awon itokasi   
3  kola oyewo bìi 26 oṣù kẹta 1946 jẹ òṣèré àti o...   
4                                           ìgbà èwe   

                                      english_tokens  \
0  [henrietta, ogan, is, a, nigerian, business, a...   
1  [she, was, succeeded, by, ahmed, tijani, mora,...   
2                                       [references]   
3  [kola, oyewo, born, march, 26, 1946, is, a, ni...   
4                       

In [11]:
# Step 8: Sentence Length Filtering (optional)
MAX_LEN = 100
df_clean = df_clean[df_clean['english_tokens'].apply(len) <= MAX_LEN]
df_clean = df_clean[df_clean['yoruba_tokens'].apply(len) <= MAX_LEN]

print("\nFiltered Data Sample (by length):")
print(df_clean.head())


Filtered Data Sample (by length):
                                             english  \
0  henrietta ogan is a nigerian business administ...   
1  she was succeeded by ahmed tijani mora a pharm...   
2                                         references   
3  kola oyewo born march 26 1946 is a nigerian ve...   
4                                         early life   

                                              yoruba  \
0  henrietta ogan je alamojuto okun owo ti o si j...   
1  o fi ipo yii sile fun ahmed tijani mora ogbont...   
2                                       awon itokasi   
3  kola oyewo bìi 26 oṣù kẹta 1946 jẹ òṣèré àti o...   
4                                           ìgbà èwe   

                                      english_tokens  \
0  [henrietta, ogan, is, a, nigerian, business, a...   
1  [she, was, succeeded, by, ahmed, tijani, mora,...   
2                                       [references]   
3  [kola, oyewo, born, march, 26, 1946, is, a, ni...   
4          

In [16]:
print("Number of cleaned sentences with Max of 100 words lenght in English:", len(df_clean))
print("Number of cleaned sentences with Max of 100 words lenght in Yoruba: ",  len(df_clean))

Number of cleaned sentences with Max of 100 words lenght in English: 12227
Number of cleaned sentences with Max of 100 words lenght in Yoruba:  12227


In [17]:
# Step 9: Split into training and validation sets
train_df, val_df = train_test_split(df_clean, test_size=0.1, random_state=42)

print("\nTraining Data Sample:")
print(train_df.head())

print("\nValidation Data Sample:")
print(val_df.head())


Training Data Sample:
                                                english  \
9164  teams notes 1 spain 103569 4 2 england 85462 3...   
8336  the title is currently within the gift of the ...   
5204  this brought the total number of confirmed cas...   
4632  because of his legal background and antecedent...   
9902  on december 20 2020 he has signed with avtodor...   

                                                 yoruba  \
9164  teams notes 1 spain 103569 4 2 england 85462 3...   
8336  lóde òní àwọn ọba ló ń fi obìnrin joyè ìyálóde...   
5204  eyi mu ki apapo iye awon isele ti won ti fidi ...   
4632  gege bi ipinle re ninu imo ofin ati iriri re g...   
9902  ni oṣu kejila ọgunjo ọdun 2020 o ti fowo si iw...   

                                         english_tokens  \
9164  [teams, notes, 1, spain, 103569, 4, 2, england...   
8336  [the, title, is, currently, within, the, gift,...   
5204  [this, brought, the, total, number, of, confir...   
4632  [because, of, his, legal,

In [18]:
# Save preprocessed data for model training
train_df.to_csv('train_tts_data.csv', index=False)
val_df.to_csv('val_tts_data.csv', index=False)