In [2]:
# File path
input_file = 'full dataset/parallel_corpus.tsv'

# Counters
missing_source = 0
missing_target = 0
total_lines = 0

# Read file and skip header
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()[1:]  # Skip header

    for line in lines:
        total_lines += 1
        parts = line.strip().split('\t')
        if len(parts) != 2:
            continue  # Skip malformed rows

        source, target = parts
        if not source.strip():
            missing_source += 1
        if not target.strip():
            missing_target += 1

# Summary
print(f"🔍 Total lines checked (excluding header): {total_lines}")
print(f"⚠️ Missing source (Tamil) sentences: {missing_source}")
print(f"⚠️ Missing target (Sinhala) sentences: {missing_target}")


🔍 Total lines checked (excluding header): 47813
⚠️ Missing source (Tamil) sentences: 0
⚠️ Missing target (Sinhala) sentences: 0


In [None]:
# File path
input_file = 'full dataset/parallel_corpus.tsv'

# Set to store unique sentence pairs
unique_pairs = set()
duplicate_count = 0
total_lines = 0

# Read file and skip header
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()[1:]  # Skip header

    for line in lines:
        total_lines += 1
        line = line.strip()
        if line in unique_pairs:
            duplicate_count += 1
        else:
            unique_pairs.add(line)

# Summary
print(f"📊 Total lines checked (excluding header): {total_lines}")
print(f"🔁 Duplicate sentence pairs: {duplicate_count}")
print(f"✅ Unique sentence pairs: {len(unique_pairs)}")


📊 Total lines checked (excluding header): 47813
🔁 Duplicate sentence pairs: 40
✅ Unique sentence pairs: 47773


In [8]:
# Input and output files
input_file = 'full dataset/parallel_corpus.tsv'
duplicates_file = 'full dataset/duplicates.tsv'
without_duplicates_file = 'full dataset/parallel_corpus_without_duplicates.tsv'

# Dictionaries to track data
line_to_rows = {}     # line -> list of row numbers
duplicates = []
without_duplicates = []
seen_once = set()

# Read file
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

header = lines[0].strip()
data_lines = lines[1:]

# Track line numbers (starting from 2 because line 1 is header)
for idx, line in enumerate(data_lines, start=2):
    line = line.strip()
    if line not in line_to_rows:
        line_to_rows[line] = [idx]
        without_duplicates.append(line)
    else:
        line_to_rows[line].append(idx)
        if line not in seen_once:
            duplicates.append(line)
            seen_once.add(line)

# ✅ Print duplicates with row numbers
print("🔁 Duplicate Sentence Pairs with Row Numbers:")
for dup in duplicates:
    rows = line_to_rows[dup]
    print(f"Rows {rows} → {dup}")

# 💾 Save duplicates for reference
with open(duplicates_file, 'w', encoding='utf-8') as df:
    df.write("row_numbers\tsource\ttarget\n")
    for dup in duplicates:
        source, target = dup.split('\t')
        row_numbers = ','.join(map(str, line_to_rows[dup]))
        df.write(f"{row_numbers}\t{source}\t{target}\n")

# 💾 Save without_duplicates data (only first occurrence)
with open(without_duplicates_file, 'w', encoding='utf-8') as cf:
    cf.write(header + '\n')
    for line in without_duplicates:
        cf.write(line + '\n')

# ✅ Summary
print(f"\n✅ Total duplicate sentence pairs: {len(duplicates)}")
print(f"📁 Duplicates saved to: {duplicates_file}")
print(f"📁 without_duplicates data saved to: {without_duplicates_file}")


🔁 Duplicate Sentence Pairs with Row Numbers:
Rows [3170, 7270] → அதை கைப்பற்றியதும் கியூபா இரண்டு பகுதியாக பிரிந்துவிடும் .	එය යටත් කරගත් විට කියුබාව කොටස් දෙකකට බෙදීයයි .
Rows [6326, 13238] → ஹிக்கடுவ நகர சபை எல்லைக்குள் பெயர் இடப்பட்ட பாதைகளில் செல்ல முடியாது .	හික්කඩුව නගර සභා බල ප්‍රදේශයේ නම් කරපු පාරවල යන්න බැහැ .
Rows [11593, 15418] → தமிழ் மக்கள் ஏற்றுக்கொள்ளக்கூடிய அரசியல் தீர்வொன்றை பெற்றுத்தரவே நான் தொடர்ச்சியாக நடவடிக்கை மேற்கொண்டேன் .	මා දිගටම කටයුතු කළේ දෙමළ ජනතාවට පිළිගත හැකි දේශපාලන විසඳුමක් ලබාදීම සඳහායි .
Rows [25463, 25470] → தேர்தல் வாக்குறுதிகளை சீர்தூக்கிப் பார்க்க வேண்டும் நடக்கவுள்ள ஜனாதிபதித் தேர்தலிலும் இதை எதிர்பார்க்கலாம்.	මැතිවරණ පොරොන්දු එකිනෙක සසඳා බැලිය යුතු වන අතර ම ඉදිරියේදී පැවැත්වීමට නියමිත ජනාධිපතිවරණයේදී ද මෙවැන්නක් බලාපොරොත්තු විය හැකිය
Rows [25466, 25473] → வேட்பாளரின் வெற்றி வாய்ப்பு மாத்திரமன்றி வாக்குறுதியின் நடைமுறைச்சாத்தியத் தன்மையையும் கவனத்தில் கொள்ள வேண்டும்.	ඡන්ද අපේක්ෂකයින්  මැතිවරණ පොරොන්දු ලබාදීම පමණක් නොව ඒවා ක්‍රියාත්මක කිරීමට ද අවධ

In [12]:
import pandas as pd
import re
from collections import Counter
from itertools import chain

# Your input file path
input_file = 'full dataset/parallel_corpus_without_duplicates.tsv'

# Load parallel corpus
df = pd.read_csv(input_file, delimiter='\t', names=['source', 'target'], encoding='utf-8', on_bad_lines='skip')

# Regex pattern: allow Tamil \u0B80-\u0BFF, Sinhala \u0D80-\u0DFF, whitespace, and basic punctuation (.,!?) only
pattern = r'[^\u0B80-\u0BFF\u0D80-\u0DFF\s.,!?]'

# Function to detect if text contains unwanted characters
def has_unwanted_char(text):
    return bool(re.search(pattern, str(text)))

# Function to extract unwanted characters from text
def extract_unwanted_chars(text):
    return re.findall(pattern, str(text))

# Find rows with unwanted characters
source_unwanted_mask = df['source'].apply(has_unwanted_char)
target_unwanted_mask = df['target'].apply(has_unwanted_char)

print("Rows with unwanted chars in source:", source_unwanted_mask.sum())
print("Rows with unwanted chars in target:", target_unwanted_mask.sum())

# Extract unwanted characters from those rows
source_unwanted_chars = df.loc[source_unwanted_mask, 'source'].apply(extract_unwanted_chars)
target_unwanted_chars = df.loc[target_unwanted_mask, 'target'].apply(extract_unwanted_chars)

# Flatten lists and count frequency
source_counts = Counter(chain.from_iterable(source_unwanted_chars))
target_counts = Counter(chain.from_iterable(target_unwanted_chars))

# Convert to DataFrame and sort descending by count
source_df = pd.DataFrame(source_counts.items(), columns=['character', 'count']).sort_values(by='count', ascending=False)
target_df = pd.DataFrame(target_counts.items(), columns=['character', 'count']).sort_values(by='count', ascending=False)

# Save to TSV for review
source_df.to_csv('unwanted charactes/unwanted_chars_source.tsv', sep='\t', index=False, encoding='utf-8')
target_df.to_csv('unwanted charactes/unwanted_chars_target.tsv', sep='\t', index=False, encoding='utf-8')

print("Unwanted characters and counts saved to 'unwanted_chars_source.tsv' and 'unwanted_chars_target.tsv'")


Rows with unwanted chars in source: 6235
Rows with unwanted chars in target: 24717
Unwanted characters and counts saved to 'unwanted_chars_source.tsv' and 'unwanted_chars_target.tsv'


In [14]:
import pandas as pd
import re
from collections import Counter
from itertools import chain

input_file = 'full dataset/parallel_corpus_without_duplicates.tsv'

# Load file with skipping bad lines
df = pd.read_csv(input_file, delimiter='\t', names=['source', 'target'], encoding='utf-8', on_bad_lines='skip')

# Regex to match unwanted characters (not Tamil, Sinhala, space or basic punctuation)
pattern = r'[^\u0B80-\u0BFF\u0D80-\u0DFF\s.,!?]'

def has_unwanted_char(text):
    return bool(re.search(pattern, str(text)))

def extract_unwanted_chars(text):
    return re.findall(pattern, str(text))

# Filter rows with unwanted characters in source and target
source_unwanted_chars = df.loc[df['source'].apply(has_unwanted_char), 'source'].apply(extract_unwanted_chars)
target_unwanted_chars = df.loc[df['target'].apply(has_unwanted_char), 'target'].apply(extract_unwanted_chars)

# Combine all unwanted chars from both columns into one list
all_unwanted_chars = list(chain.from_iterable(source_unwanted_chars)) + list(chain.from_iterable(target_unwanted_chars))

# Count frequency of all unwanted characters combined
combined_counts = Counter(all_unwanted_chars)

# Convert to DataFrame
combined_df = pd.DataFrame(combined_counts.items(), columns=['character', 'count']).sort_values(by='count', ascending=False)

# Save to single TSV file
combined_df.to_csv('unwanted charactes/unwanted_characters_combined.tsv', sep='\t', index=False, encoding='utf-8')

print("Combined unwanted characters saved to 'unwanted charactes/unwanted_characters_combined.tsv'")


Combined unwanted characters saved to 'unwanted charactes/unwanted_characters_combined.tsv'


In [17]:
import pandas as pd

input_file = 'full dataset/parallel_corpus_without_duplicates.tsv'
output_file = 'full dataset/parallel_corpus_second_stage.tsv'

# Load file and use the first row as header
df = pd.read_csv(input_file, delimiter='\t', encoding='utf-8', on_bad_lines='skip', header=0)

# Character to remove - the ZERO WIDTH JOINER (Unicode U+200D)
char_to_remove = '\u200d'

# Remove the character from both columns
df['source'] = df['source'].str.replace(char_to_remove, '', regex=False)
df['target'] = df['target'].str.replace(char_to_remove, '', regex=False)

# Save cleaned dataframe with header
df.to_csv(output_file, sep='\t', index=False, encoding='utf-8')

print(f"Removed '{repr(char_to_remove)}' from both columns and saved to '{output_file}'")


Removed ''\u200d'' from both columns and saved to 'full dataset/parallel_corpus_second_stage.tsv'


# Manuvaly Removed other Unwanted Characters