In [2]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies ... done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pydone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=325277 sha256=fdd7b339ff58ecf4ecef5c066af2a2977d73ffe3ca737e67032348ccbdc2a777
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58e02cec2ddb20ce3e59fad8d3c92a
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.3 pybind11-2.13.6
Note: you may need to restart the kernel to use updated packages.


In [3]:
import json
import os
import fasttext
import re
import csv

model = fasttext.load_model('lid201-model.bin')
lang_prediction = model.predict("mina ngithanda inyama")
lang_label = lang_prediction[0][0].split("__label__")[1]
lang_label

'zul_Latn'

In [26]:
import json
import os
import fasttext
import re
import csv

model = fasttext.load_model('lid201-model.bin')

def is_zulu_xhosa_swati(line):
    lang_prediction = model.predict(line)
    lang_label = lang_prediction[0][0].split("__label__")[1]
    if lang_label in ['zul_Latn']:
        return True
    return False

def is_english(line):
    lang_prediction = model.predict(line)
    lang_label = lang_prediction[0][0].split("__label__")[1]
    if lang_label in ['eng_Latn']:
        return True
    return False

def filter_parallel_text(input_file, accepted_file, rejected_file, model, target_accepted=100000):
    accepted_pairs = []
    rejected_pairs = []
    total_processed = 0
    unique_zulu = set()  # Track unique Zulu sentences
    
    while len(accepted_pairs) < target_accepted:
        with open(input_file, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            header = next(reader)  # Skip header
            
            for row in enumerate(reader):
                # Skip rows we've already processed in previous iterations
                if row[0] < total_processed:
                    continue
                    
                if len(row[1]) != 2:
                    continue
                    
                eng_text, zulu_text = row[1]
                
                # Check if both texts pass their respective language tests
                valid_eng = is_english(eng_text)
                valid_zulu = is_zulu_xhosa_swati(zulu_text)
                is_unique_zulu = zulu_text not in unique_zulu
                
                # Store the pair and the reason for rejection if any
                if valid_eng and valid_zulu and is_unique_zulu:
                    accepted_pairs.append((eng_text, zulu_text))
                    unique_zulu.add(zulu_text)  # Add to set of unique Zulu sentences
                    if len(accepted_pairs) >= target_accepted:
                        break
                else:
                    reason = []
                    if not valid_eng:
                        reason.append("Failed English check")
                    if not valid_zulu:
                        reason.append("Failed Zulu/Xhosa/Swati check")
                    if not is_unique_zulu:
                        reason.append("Duplicate Zulu sentence")
                    rejected_pairs.append({
                        "english": eng_text,
                        "zulu": zulu_text,
                        "reason": " & ".join(reason)
                    })
                
                total_processed += 1
                
                # Print progress periodically
                if total_processed % 1000 == 0:
                    print(f"Processed: {total_processed}, Accepted: {len(accepted_pairs)}, Unique Zulu: {len(unique_zulu)}")
        
        # If we've processed all rows but still haven't found enough accepted pairs
        if total_processed == sum(1 for row in csv.reader(open(input_file))) - 1:
            print("Reached end of file before finding enough accepted pairs.")
            break
    
    # Write accepted pairs to CSV
    with open(accepted_file, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['english', 'zulu'])
        writer.writerows(accepted_pairs)
    
    # Write rejected pairs to JSON for better readability of rejection reasons
    with open(rejected_file, 'w', encoding='utf-8') as outfile:
        json.dump(rejected_pairs, outfile, indent=2, ensure_ascii=False)
    
    return accepted_pairs, rejected_pairs

# Usage
input_file = 'eng_zul_nllb_data.csv'
accepted_file = 'accepted_pairs.csv'
rejected_file = 'rejected_pairs.json'

accepted, rejected = filter_parallel_text(input_file, accepted_file, rejected_file, model)
print(f"\nFinal Results:")
print(f"Total pairs processed: {len(accepted) + len(rejected)}")
print(f"Accepted pairs: {len(accepted)}")
print(f"Rejected pairs: {len(rejected)}")
print(f"\nResults saved to:")
print(f"- Accepted pairs: {accepted_file}")
print(f"- Rejected pairs: {rejected_file}")

Processed: 1000, Accepted: 863, Unique Zulu: 863
Processed: 2000, Accepted: 1740, Unique Zulu: 1740
Processed: 3000, Accepted: 2631, Unique Zulu: 2631
Processed: 4000, Accepted: 3507, Unique Zulu: 3507
Processed: 5000, Accepted: 4408, Unique Zulu: 4408
Processed: 6000, Accepted: 5310, Unique Zulu: 5310
Processed: 7000, Accepted: 6204, Unique Zulu: 6204
Processed: 8000, Accepted: 7109, Unique Zulu: 7109
Processed: 9000, Accepted: 8003, Unique Zulu: 8003
Processed: 10000, Accepted: 8889, Unique Zulu: 8889
Processed: 11000, Accepted: 9773, Unique Zulu: 9773
Processed: 12000, Accepted: 10664, Unique Zulu: 10664
Processed: 13000, Accepted: 11561, Unique Zulu: 11561
Processed: 14000, Accepted: 12456, Unique Zulu: 12456
Processed: 15000, Accepted: 13356, Unique Zulu: 13356
Processed: 16000, Accepted: 14265, Unique Zulu: 14265
Processed: 17000, Accepted: 15140, Unique Zulu: 15140
Processed: 18000, Accepted: 16027, Unique Zulu: 16027
Processed: 19000, Accepted: 16910, Unique Zulu: 16910
Process

In [33]:
import re

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

with open("flores_zulu_val.txt", "r") as file:
    sentences = file.readlines()[:100]

cleaned_sentences = [preprocess_text(sentence.strip()) for sentence in sentences]

with open("cleaned_flores_zulu_val.txt", "w") as file:
    for sentence in cleaned_sentences:
        file.write(sentence + "\n")

print("Preprocessing complete. Cleaned text saved to 'cleaned_flores_zulu_val.txt'")

Preprocessing complete. Cleaned text saved to 'cleaned_flores_zulu_val.txt'


In [2]:
import pandas as pd

# Read the first CSV with English and Zulu
df1 = pd.read_csv('./final_merged_data_nllb.csv')
df2 = pd.read_csv('./final_merged_data_nllb_two.csv')
df3 = pd.read_csv('./final_merged_data_nllb_three.csv')

# # Read the second CSV with original and segmented Zulu
# df2 = pd.read_csv('segmentation_results_segmenter_three.csv')

# Create new dataframe with desired columns
final_df = pd.DataFrame({
    'english': df1['english'],
    'isizulu': df1['isizulu'],
    'segmenter_one': df1['segmenter_one'],
    'segmenter_two': df2['segmenter_two'],
    'segmenter_three': df3['segmenter_three']
})

final_df.to_csv('nllb_segmented_data.csv', index=False)

final_df.tail() 

Unnamed: 0,english,isizulu,segmenter_one,segmenter_two,segmenter_three
99995,it is important to take the time to listen and...,kubalulekile ukuthatha isikhathi sokulalela fu...,ku-balulek-ile uku-thath-a i-si-khathi so-ku-l...,ku-balulek-ile uku-thath-a isi-khathi so-ku-la...,ku-balulek-ile uku-thath-a isikhathi so-ku-lal...
99996,do you know a resource thats not on our list,uyalazi uhlelo oluhle olungekho ohlwini lwethu,u-ya-l-azi u-hlelo olu-hl-e olu-nge-kho o-hl-w...,u-ya-l-azi u-hlelo olu-hl-e olu-nge-kho o-hlwi...,uya-l-azi uhlelo oluhl-e olungekho o-hlwini lw...
99997,the local bank is obviously not likely to take...,ibhange lakho lendawo ngokusobala ngeke likhip...,i-bhang-e la-kho le-ndawo ng-o-ku-sobal-a ng-e...,i-bhang-e la-kho le-n-dawo ngo-ku-sobal-a ngek...,ibhange la-kho le-ndawo ngo-ku-sobal-a ngek-e ...
99998,ambulances were waiting,ambulanciers lalinde,a-m-bulanci-ers l-a-lind-e,a-m-bulancires lali-nde,ambulanci-ers lalind-e
99999,i am proud that speed humps will be erected he...,kuyangojabulisa ukuthi sekuzokwakhiwa izinciph...,ku-ya-ng-ojabulis-a uku-thi se-ku-zo-kw-akhiw-...,ku-ya-ng-ojabulis-a uku-thi se-ku-zokw-akhiw-a...,kuya-ng-ojabulis-a uku-thi sekuzokw-akhiw-a iz...


In [6]:
import pandas as pd

random_row = final_df.sample(n=1)

# Print the row in a nicely formatted way
print("\nRandom sample row:")
print("-" * 80)
print(f"English: {random_row['english'].values[0]}")
print(f"isiZulu: {random_row['isizulu'].values[0]}")
print(f"Segmented isiZulu: {random_row['segmented_isizulu'].values[0]}")
print("-" * 80)


Random sample row:
--------------------------------------------------------------------------------
English: it was devoted entirely to opening the eyes of those blinded by the doctrines of evolution and creationism
isiZulu: uma wasebenzisa ctrlshiftf khonake ctrle kuyodingeka ukuthi ziqalise futhi
Segmented isiZulu: uma wa-sebenzis-a ctrlshif-f khonak-e ctrl-e kuyo-dingek-a uku-thi zi-qalis-e futh-i
--------------------------------------------------------------------------------


In [27]:
index = 99998  # Change this to any index you want
row = final_df.iloc[index]

print("\nRow at index", index)
print("-" * 80)
print(f"English: {row['english']}")
print(f"isiZulu: {row['isizulu']}")
print(f"Segmented isiZulu: {row['segmented_isizulu']}")
print("-" * 80)


Row at index 99998
--------------------------------------------------------------------------------
English: ambulances were waiting
isiZulu: ambulanciers lalinde
Segmented isiZulu: ambulanci-ers lalind-e
--------------------------------------------------------------------------------
