In [1]:
import csv
import random

input_files = [
    "UCSC Data/parallel_corpus_second_stage.tsv",
    "UOM Data/uom_parallel_corpus_second_stage.tsv"
]

output_file = "final_combined_shuffled_sinhala_tamil.tsv"

all_rows = []

# Read all input files and collect rows (skip header)
for filename in input_files:
    with open(filename, "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # Skip header
        for row in reader:
            if len(row) >= 2:
                all_rows.append(row)

# Shuffle all rows randomly
random.shuffle(all_rows)

# Write combined and shuffled rows to output file with header
with open(output_file, "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    # Write header
    writer.writerow(["source", "target"])
    # Write shuffled rows
    writer.writerows(all_rows)

print(f"Done! Combined and shuffled file saved as '{output_file}'")


Done! Combined and shuffled file saved as 'final_combined_shuffled_sinhala_tamil.tsv'


In [2]:
# File path
input_file = 'final_combined_shuffled_sinhala_tamil.tsv'

# Counters
missing_source = 0
missing_target = 0
total_lines = 0

# Read file and skip header
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()[1:]  # Skip header

    for line in lines:
        total_lines += 1
        parts = line.strip().split('\t')
        if len(parts) != 2:
            continue  # Skip malformed rows

        source, target = parts
        if not source.strip():
            missing_source += 1
        if not target.strip():
            missing_target += 1

# Summary
print(f"üîç Total lines checked (excluding header): {total_lines}")
print(f"‚ö†Ô∏è Missing source (Tamil) sentences: {missing_source}")
print(f"‚ö†Ô∏è Missing target (Sinhala) sentences: {missing_target}")


üîç Total lines checked (excluding header): 52779
‚ö†Ô∏è Missing source (Tamil) sentences: 0
‚ö†Ô∏è Missing target (Sinhala) sentences: 0


In [3]:
# File path
input_file = 'final_combined_shuffled_sinhala_tamil.tsv'

# Set to store unique sentence pairs
unique_pairs = set()
duplicate_count = 0
total_lines = 0

# Read file and skip header
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()[1:]  # Skip header

    for line in lines:
        total_lines += 1
        line = line.strip()
        if line in unique_pairs:
            duplicate_count += 1
        else:
            unique_pairs.add(line)

# Summary
print(f"üìä Total lines checked (excluding header): {total_lines}")
print(f"üîÅ Duplicate sentence pairs: {duplicate_count}")
print(f"‚úÖ Unique sentence pairs: {len(unique_pairs)}")


üìä Total lines checked (excluding header): 52779
üîÅ Duplicate sentence pairs: 23
‚úÖ Unique sentence pairs: 52756


In [5]:
# Input and output files
input_file = 'final_combined_shuffled_sinhala_tamil.tsv'
duplicates_file = 'final_duplicates.tsv'
without_duplicates_file = 'final_parallel_corpus_without_duplicates.tsv'

# Dictionaries to track data
line_to_rows = {}     # line -> list of row numbers
duplicates = []
without_duplicates = []
seen_once = set()

# Read file
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

header = lines[0].strip()
data_lines = lines[1:]

# Track line numbers (starting from 2 because line 1 is header)
for idx, line in enumerate(data_lines, start=2):
    line = line.strip()
    if line not in line_to_rows:
        line_to_rows[line] = [idx]
        without_duplicates.append(line)
    else:
        line_to_rows[line].append(idx)
        if line not in seen_once:
            duplicates.append(line)
            seen_once.add(line)

# ‚úÖ Print duplicates with row numbers
print("üîÅ Duplicate Sentence Pairs with Row Numbers:")
for dup in duplicates:
    rows = line_to_rows[dup]
    print(f"Rows {rows} ‚Üí {dup}")

# üíæ Save duplicates for reference
with open(duplicates_file, 'w', encoding='utf-8') as df:
    df.write("row_numbers\tsource\ttarget\n")
    for dup in duplicates:
        source, target = dup.split('\t')
        row_numbers = ','.join(map(str, line_to_rows[dup]))
        df.write(f"{row_numbers}\t{source}\t{target}\n")

# üíæ Save without_duplicates data (only first occurrence)
with open(without_duplicates_file, 'w', encoding='utf-8') as cf:
    cf.write(header + '\n')
    for line in without_duplicates:
        cf.write(line + '\n')

# ‚úÖ Summary
print(f"\n‚úÖ Total duplicate sentence pairs: {len(duplicates)}")
print(f"üìÅ Duplicates saved to: {duplicates_file}")
print(f"üìÅ without_duplicates data saved to: {without_duplicates_file}")


üîÅ Duplicate Sentence Pairs with Row Numbers:
Rows [1565, 7560] ‚Üí ‡ÆÖ‡Æ∞‡Æö‡Ææ‡Æô‡Øç‡Æï ‡Æ™‡Ææ‡Æü‡Æö‡Ææ‡Æ≤‡Øà‡Æï‡Æ≥‡Øà‡ÆØ‡ØÅ‡ÆÆ‡Øç, ‡Æ™‡Æ≤‡Øç‡Æï‡Æ≤‡Øà‡Æï‡Øç ‡Æï‡Æ¥‡Æï‡Æô‡Øç‡Æï‡Æ≥‡Øà‡ÆØ‡ØÅ‡ÆÆ‡Øç ‡Æµ‡Æø‡Æü‡ØÅ‡ÆÆ‡ØÅ‡Æ±‡Øà ‡Æ®‡Ææ‡Æü‡Øç‡Æï‡Æ≥‡Æø‡Æ≤‡Øç ‡Æï‡Æ±‡Øç‡Æï‡Øà ‡ÆÖ‡Æ≤‡Æï‡ØÅ‡Æï‡Æ≥‡ØÅ‡Æï‡Øç‡Æï‡Ææ‡Æï‡Æ™‡Øç ‡Æ™‡ØÜ‡Æ±‡Øç‡Æ±‡ØÅ‡Æï‡Øç‡Æï‡ØÜ‡Ææ‡Æü‡ØÅ‡Æ§‡Øç‡Æ§‡Æ≤‡Øç,	‡∂ª‡∂¢‡∂∫‡∑ö ‡∂¥‡∑è‡∑É‡∂Ω‡∑ä ‡∑Ñ‡∑è ‡∑Ä‡∑í‡∑Å‡∑ä‡∑Ä‡∑Ä‡∑í‡∂Ø‡∑ä‡∂∫‡∑è‡∂Ω ‡∂±‡∑í‡∑Ä‡∑è‡∂©‡∑î ‡∂Ø‡∑í‡∂±‡∑Ä‡∂Ω‡∂Ø‡∑ì ‡∂ë‡∂∏ ‡∂∏‡∂∞‡∑ä‡∂∫‡∑É‡∑ä‡∂Æ‡∑è‡∂± ‡∂Ö‡∂∞‡∑ä‡∂∫‡∑è‡∂¥‡∂± ‡∂í‡∂ö‡∂ö ‡∑É‡∂≥‡∑Ñ‡∑è ‡∂Ω‡∂∂‡∑è‡∂Ø‡∑ì‡∂∏,
Rows [4368, 7617] ‚Üí ‡Æ§‡ØÜ‡Æô‡Øç‡Æï‡ØÅ‡Æï‡Øç ‡Æï‡ÆÆ‡Æ§‡Øç‡Æ§‡ØÜ‡Ææ‡Æ¥‡Æø‡Æ≤‡Øç, ‡Æ§‡ØÜ‡Æô‡Øç‡Æï‡ØÅ ‡Æö‡Ææ‡Æ∞‡Øç ‡Æµ‡Æ≥‡Æô‡Øç‡Æï‡Æ≥‡Øç ‡ÆÜ‡Æï‡Æø‡ÆØ‡Æµ‡Æ±‡Øç‡Æ±‡Æø‡Æ©‡Øç ‡ÆÖ‡Æ™‡Æø‡Æµ‡Æø‡Æ∞‡ØÅ‡Æ§‡Øç‡Æ§‡Æø‡ÆØ‡Øà‡ÆØ‡ØÅ‡ÆÆ‡Øç ‡ÆÆ‡Øá‡ÆÆ‡Øç‡Æ™‡Ææ‡Æü‡Øç‡Æü‡Øà‡ÆØ‡ØÅ‡ÆÆ‡Øç ‡Æí‡Æ¥‡ØÅ‡Æô‡Øç‡Æï‡ÆÆ‡Øà‡Æ§‡Øç‡Æ§‡Æ≤‡Øç,	‡∂¥‡∑ú‡∂Ω‡∑ä ‡∂ö‡∂ª‡∑ä‡∂∏‡∑è‡∂±‡∑ä‡∂≠‡∂∫‡∑ô‡∑Ñ‡∑í ‡∑É‡∑Ñ ‡∂¥‡∑ú‡∂Ω‡∑ä ‡∑É‡∂∏‡∑ä‡∂¥‡∂≠‡∑ä‡∑Ä‡∂Ω ‡∑É‡∂Ç‡∑Ä‡∂ª‡∑ä‡∂∞‡

In [6]:
import pandas as pd
import re
from collections import Counter
from itertools import chain

# Your input file path
input_file = 'final_parallel_corpus_without_duplicates.tsv'

# Load parallel corpus
df = pd.read_csv(input_file, delimiter='\t', names=['source', 'target'], encoding='utf-8', on_bad_lines='skip')

# Regex pattern: allow Tamil \u0B80-\u0BFF, Sinhala \u0D80-\u0DFF, whitespace, and basic punctuation (.,!?) only
pattern = r'[^\u0B80-\u0BFF\u0D80-\u0DFF\s.,!?]'

# Function to detect if text contains unwanted characters
def has_unwanted_char(text):
    return bool(re.search(pattern, str(text)))

# Function to extract unwanted characters from text
def extract_unwanted_chars(text):
    return re.findall(pattern, str(text))

# Find rows with unwanted characters
source_unwanted_mask = df['source'].apply(has_unwanted_char)
target_unwanted_mask = df['target'].apply(has_unwanted_char)

print("Rows with unwanted chars in source:", source_unwanted_mask.sum())
print("Rows with unwanted chars in target:", target_unwanted_mask.sum())

# Extract unwanted characters from those rows
source_unwanted_chars = df.loc[source_unwanted_mask, 'source'].apply(extract_unwanted_chars)
target_unwanted_chars = df.loc[target_unwanted_mask, 'target'].apply(extract_unwanted_chars)

# Flatten lists and count frequency
source_counts = Counter(chain.from_iterable(source_unwanted_chars))
target_counts = Counter(chain.from_iterable(target_unwanted_chars))

# Convert to DataFrame and sort descending by count
source_df = pd.DataFrame(source_counts.items(), columns=['character', 'count']).sort_values(by='count', ascending=False)
target_df = pd.DataFrame(target_counts.items(), columns=['character', 'count']).sort_values(by='count', ascending=False)

# Save to TSV for review
source_df.to_csv('unwanted charactes/final_unwanted_chars_source.tsv', sep='\t', index=False, encoding='utf-8')
target_df.to_csv('unwanted charactes/final_unwanted_chars_target.tsv', sep='\t', index=False, encoding='utf-8')

print("Unwanted characters and counts saved to 'final_unwanted_chars_source.tsv' and 'final_unwanted_chars_target.tsv'")


Rows with unwanted chars in source: 7375
Rows with unwanted chars in target: 8902
Unwanted characters and counts saved to 'final_unwanted_chars_source.tsv' and 'final_unwanted_chars_target.tsv'


In [8]:
import pandas as pd
import re
from collections import Counter
from itertools import chain

input_file = 'final_parallel_corpus_without_duplicates.tsv'

# Load file with skipping bad lines
df = pd.read_csv(input_file, delimiter='\t', names=['source', 'target'], encoding='utf-8', on_bad_lines='skip')

# Regex to match unwanted characters (not Tamil, Sinhala, space or basic punctuation)
pattern = r'[^\u0B80-\u0BFF\u0D80-\u0DFF\s.,!?]'

def has_unwanted_char(text):
    return bool(re.search(pattern, str(text)))

def extract_unwanted_chars(text):
    return re.findall(pattern, str(text))

# Filter rows with unwanted characters in source and target
source_unwanted_chars = df.loc[df['source'].apply(has_unwanted_char), 'source'].apply(extract_unwanted_chars)
target_unwanted_chars = df.loc[df['target'].apply(has_unwanted_char), 'target'].apply(extract_unwanted_chars)

# Combine all unwanted chars from both columns into one list
all_unwanted_chars = list(chain.from_iterable(source_unwanted_chars)) + list(chain.from_iterable(target_unwanted_chars))

# Count frequency of all unwanted characters combined
combined_counts = Counter(all_unwanted_chars)

# Convert to DataFrame
combined_df = pd.DataFrame(combined_counts.items(), columns=['character', 'count']).sort_values(by='count', ascending=False)

# Save to single TSV file
combined_df.to_csv('unwanted charactes/final_unwanted_characters_combined.tsv', sep='\t', index=False, encoding='utf-8')

print("Combined unwanted characters saved to 'unwanted charactes/final_unwanted_characters_combined.tsv'")


Combined unwanted characters saved to 'unwanted charactes/final_unwanted_characters_combined.tsv'


In [9]:
import pandas as pd

input_file = 'final_parallel_corpus_without_duplicates.tsv'
output_file = 'final_parallel_corpus_second_stage.tsv'

# Load file and use the first row as header
df = pd.read_csv(input_file, delimiter='\t', encoding='utf-8', on_bad_lines='skip', header=0)

# Character to remove - the ZERO WIDTH JOINER (Unicode U+200D)
char_to_remove = '\u200d'

# Remove the character from both columns
df['source'] = df['source'].str.replace(char_to_remove, '', regex=False)
df['target'] = df['target'].str.replace(char_to_remove, '', regex=False)

# Save cleaned dataframe with header
df.to_csv(output_file, sep='\t', index=False, encoding='utf-8')

print(f"Removed '{repr(char_to_remove)}' from both columns and saved to '{output_file}'")


Removed ''\u200d'' from both columns and saved to 'final_parallel_corpus_second_stage.tsv'


# Manuvaly Removed other Unwanted Characters