In [1]:
import pandas as pd
from datasets import load_dataset_builder, get_dataset_config_names, load_dataset
from huggingface_hub import HfApi
import numpy as np
import pdb
import logging
import re
from translate.storage.tmx import tmxfile
import matplotlib.pyplot as plt

In [2]:
ID = 'VarunGumma/IN22-Conv-Doc-Level'
# config = get_dataset_config_names(ID)

In [3]:
ds = load_dataset(ID)

# 0. Hugging Face login
- Only necessary for gated datasets on Hugging Face.
- Access is granted per user: if I have been granted access to a gated dataset, you still need to request access with your own account.

In [4]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1. Data Visualization

In [5]:
from utils import create_conversion_dict, list_languages, list_languages_google

In [6]:
# Read language pair data from both sources
mt_hf_df = pd.read_csv('data/language_pairs_hf.csv')
mt_ext_df = pd.read_csv('data/language_pairs_external.csv')
mt = pd.concat([mt_hf_df, mt_ext_df])
mt.head(3)

Unnamed: 0,Author/Dataset,Language Pair,# Train Set,# Development Set,# Test Set
0,Bretagne/Korpus-divyezhek-brezhoneg-galleg,br-fr,61503,0,0
1,VishaliSekar/tamil_colloquial,ta-en,69,0,30
2,nandhinivaradharajan14/tamil-english-colloquia...,en-ta,197110,0,0


In [7]:
# Cardinality of the raw (not yet normalized) pair labels and of the dataset ids.
n_raw_pairs = mt['Language Pair'].nunique(dropna=False)
n_raw_datasets = mt['Author/Dataset'].nunique(dropna=False)
print(f"Unique pairs before normalization: {n_raw_pairs}")
print(f"Unique datasets: {n_raw_datasets}")

Unique pairs before normalization: 2445
Unique datasets: 561


In [8]:
iso_mappings = create_conversion_dict()

In [9]:
def normalize_pairs(mt_df, iso_map) -> pd.DataFrame:
    """
    Normalize the 'Language Pair' column of a language-pair table.

    Steps:
    1. Strip script/locale information from each pair label
       (e.g. ``eng_Latn-fra_Latn`` -> ``eng-fra``, ``pt_BR-en`` -> ``pt-en``)
       and treat ``2`` as a pair separator (``ta2en`` -> ``ta-en``).
    2. Discard rows whose label splits into more than two parts.
    3. Convert each side through ``iso_map`` (codes without an entry are kept
       unchanged) and discard rows where either side is missing.
    4. Normalize src/tgt direction: the final label is ``"<min>-<max>"`` in
       string order, so ``ta-en`` and ``en-ta`` both become ``en-ta``.

    Parameters
    ----------
    mt_df : pd.DataFrame
        Table with a string 'Language Pair' column. It is not modified.
    iso_map : dict-like
        Lookup (must provide ``.get``) from a language code to its
        normalized code.

    Returns
    -------
    pd.DataFrame
        Copy of the surviving rows (original index labels preserved) with the
        normalized 'Language Pair' plus 'lang_1' / 'lang_2' columns holding
        the mapped codes in their original left/right order.
    """
    scripts = r'_[A-Z][a-z]{3}'
    endings = r'(-|_)[A-Z]{2,}'
    misc = "(-sursilv|-vallader|-tw|-valencia|_br|_tw)"

    # Clean the labels on a separate Series so the caller's frame is untouched.
    pair = mt_df['Language Pair']
    for regex in (scripts, endings, misc):
        pair = pair.str.replace(regex, "", regex=True)
    pair = pair.str.replace('2', '-', regex=False)

    parts = pair.str.split('-', expand=True)
    # Always expose columns 0 and 1, even when no label contains a separator.
    parts = parts.reindex(columns=list(range(max(2, parts.shape[1]))))

    has_extra_parts = parts.iloc[:, 2:].notna().any(axis=1)
    lang_1 = parts[0].apply(lambda code: iso_map.get(code, code))
    lang_2 = parts[1].apply(lambda code: iso_map.get(code, code))

    # Filter with a positional boolean mask instead of `drop(index_labels)`:
    # the input is typically a concat of several frames with repeated index
    # labels, and a label-based drop would remove every row sharing a label.
    keep = (
        ~has_extra_parts.to_numpy(dtype=bool)
        & lang_1.notna().to_numpy(dtype=bool)
        & lang_2.notna().to_numpy(dtype=bool)
    )

    out = mt_df.loc[keep].copy()
    out['lang_1'] = lang_1.to_numpy()[keep]
    out['lang_2'] = lang_2.to_numpy()[keep]
    # Direction-independent label: smaller code first, in string order.
    out['Language Pair'] = [
        f"{min(a, b)}-{max(a, b)}" for a, b in zip(out['lang_1'], out['lang_2'])
    ]

    return out

In [10]:
norm_mt = normalize_pairs(mt, iso_mappings)

In [11]:
# Same two cardinalities, measured on the normalized table.
n_norm_pairs = norm_mt['Language Pair'].nunique(dropna=False)
n_norm_datasets = norm_mt['Author/Dataset'].nunique(dropna=False)
print(f"Unique pairs after normalization: {n_norm_pairs}")
print(f"Unique datasets after normalization: {n_norm_datasets}")  # one dataset dropped

Unique pairs after normalization: 1641
Unique datasets after normalization: 560


In [12]:
norm_mt.head()

Unnamed: 0,Author/Dataset,Language Pair,# Train Set,# Development Set,# Test Set,lang_1,lang_2
0,Bretagne/Korpus-divyezhek-brezhoneg-galleg,br-fr,61503,0,0,br,fr
1,VishaliSekar/tamil_colloquial,en-ta,69,0,30,ta,en
2,nandhinivaradharajan14/tamil-english-colloquia...,en-ta,197110,0,0,en,ta
3,jaksani/english-to-telugu,en-te,420671,0,0,en,te
4,ashuChufamo/parallel-corpus_en-am,am-en,27390,0,0,en,am


In [13]:
# Merge the languages reported by both helpers into one collection;
# `supported_langs` is extended in place with the entries of `supported_langs_gv2`.
# NOTE(review): `.update` implies list_languages() returns a dict or a set — confirm in utils.
supported_langs = list_languages()
supported_langs_gv2 = list_languages_google()
supported_langs.update(supported_langs_gv2)

In [14]:
def is_in_Google(row, supported) -> bool:
    """
    Tell whether both languages of a pair are in the supported collection.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys 'lang_1' and 'lang_2'.
    supported : container
        Collection of supported language codes; only membership (`in`) is used.

    Returns
    -------
    bool
        True only when both 'lang_1' and 'lang_2' are found in `supported`.
    """
    return row['lang_1'] in supported and row['lang_2'] in supported

In [15]:
# potential list of datasets/languages for NMT:
# keep the pairs where at least one side is missing from `supported_langs`,
# largest training sets first.
fully_supported = norm_mt.apply(is_in_Google, axis=1, supported=supported_langs)
candidates = norm_mt.loc[~fully_supported].sort_values(by="# Train Set", ascending=False)

In [17]:
candidates.head(30)

Unnamed: 0,Author/Dataset,Language Pair,# Train Set,# Development Set,# Test Set,lang_1,lang_2
819,Helsinki-NLP/opus_dgt,bg-sh,1488507,0,0,bg,sh
826,Helsinki-NLP/opus_dgt,mt-sh,1450424,0,0,mt,sh
3224,projecte-aina/ES-AST_Parallel_Corpus,ast-es,704378,0,0,es,ast
710,Helsinki-NLP/opus-100,en-nn,486055,2000,2000,en,nn
93,Helsinki-NLP/OPUS-100,en-nn,486055,2000,2000,en,nn
2812,d0rj/ru-mhr-parallel,mhr-ru,417103,0,0,mhr,ru
43,AigizK/mari-russian-parallel-corpora,mhr-ru,413841,0,0,mhr,ru
547,Helsinki-NLP/multi_para_crawl,nb-ru,399050,0,0,nb,ru
81,Helsinki-NLP/OPUS-100,en-sh,267211,2000,2000,en,sh
722,Helsinki-NLP/opus-100,en-sh,267211,2000,2000,en,sh


In [14]:
# visualization pending

In [None]:
# Language-pair labels of the first 10 rows, later passed to plt.bar as x values.
# NOTE(review): `df_cleaned` is not defined in any cell visible in this notebook,
# so this would raise NameError on a fresh kernel — confirm which frame is meant
# (possibly `candidates`).
x_data = df_cleaned.head(10)['Language Pair']
x_data

In [None]:
# Hand-written display names for the x-axis (10 entries).
# NOTE: `x_data` is assigned again in the following cell, so these labels are
# only used if that cell is skipped.
x_data = [
    "Asturian-Spanish",
    "Norwegian Bokmål-Russian",
    "French-Plateau Malagasy",
    "Bodo-English",
    "Russian-Veps",
    "French-Kabyle",
    "English-Bodo",
    "English-Kashmiri",
    "Plateau Malagasy-Russian",
    "Montenegrin-English"
]


In [None]:
# Display labels with one side of each pair replaced by the placeholder "XX";
# this assignment overrides the fully spelled-out list from the previous cell.
x_data = [
    "Asturian-XX",
    "Norwegian Bokmål-XX",
    "XX-Plateau Malagasy",
    "Bodo-XX",
    "XX-Veps",
    "XX-Kabyle",
    "XX-Bodo",
    "XX-Kashmiri",
    "Plateau Malagasy-XX",
    "Montenegrin-XX"
]

In [None]:
# Training-set sizes of the first 10 rows, later passed to plt.bar as bar heights.
# NOTE(review): `df_cleaned` is not defined in any cell visible in this notebook — confirm its origin.
y_data = df_cleaned.head(10)['# Train Set']

In [None]:
y_data

In [None]:
# One colour per bar (10 entries, one for each x_data label).
# '#2ca02c' (positions 2 and 8) and '#d62728' (positions 3 and 6) repeat at the
# positions where Plateau Malagasy and Bodo repeat in x_data — presumably so
# that the same language keeps the same colour.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#d62728', '#e377c2', '#2ca02c', '#7f7f7f']

In [None]:
# Bar chart of y_data per x_data label, one colour per bar.
fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.bar(x_data, y_data, width=0.8, color=colors)  # explicit bar width

# Axis labels and title
ax.set_xlabel('Language Pairs')
ax.set_ylabel('Number of Examples')
ax.set_title('Unsupported languages in Google Translate')
ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation on y

# Write each bar's value just above it
for rect in bars:
    value = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, value, f'{value:,}',
            ha='center', va='bottom', fontsize=7, fontweight='bold')

# Slanted, italic x tick labels
plt.setp(ax.get_xticklabels(), rotation=23, ha='right', fontsize=10, fontstyle='italic')

# Tighten the layout so labels do not overlap, then render
fig.tight_layout()
plt.show()


In [None]:
# Bar chart of y_data per x_data label, all bars in blue.
fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.bar(x_data, y_data, color='blue')

# Axis labels and title
ax.set_xlabel('Language Pairs')
ax.set_ylabel('Number of Examples')
ax.set_title('Language Pairs: Where to next?')
ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation on y

# Write each bar's value just above it
for rect in bars:
    value = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, value, f'{value:,}',
            ha='center', va='bottom', fontsize=8, fontweight='bold')

# Italic, slightly rotated x tick labels, then render
plt.setp(ax.get_xticklabels(), rotation=15, fontstyle='italic')
plt.show()

In [None]:
x_data

In [None]:
y_data

## 2. Update ```language_pairs_external.csv```

In [None]:
from utils import update_pairs
from math import factorial

In [None]:
mtex_pair = pd.read_csv('data/language_pairs_external.csv')
mtex_pair.head()

In [None]:
# Take a look at the helper function for ```update pairs```
help(update_pairs)

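### Multiway example using the [HornMT](https://github.com/asmelashteka/HornMT) dataset from GitHub
For a multiway dataset covering $n$ languages, the number of directed language pairs is given by the permutation formula $P(n, 2) = \frac{n!}{(n-2)!} = n(n-1)$; for the 6 languages of HornMT this gives 30 pairs.
- Set `save` to `True` to save your changes.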

In [None]:
def permutations(n, r):
    '''
    Returns the number of permutations, i.e. ordered selections of r items
    out of n: n! / (n - r)!.

    Raises ValueError (from `factorial`) when r > n, because n - r is negative.
    '''
    # Floor division keeps everything in exact integer arithmetic. True division
    # goes through a float, which loses precision for large results and raises
    # OverflowError once n! no longer fits in a float.
    return factorial(n) // factorial(n - r)

val = permutations(6, 2)
print(f"There will be {val} distinct pairs for the HornMT dataset.")

In [None]:
# Arguments for function
data_auth = 'LesanAI/HornMT' # if external check the main contributor to the dataset
langs = ['aa', 'am', 'en', 'om', 'so', 'ti']
rows = [0, 0, 2030] # multiway datasets will have the same n_rows
d_type = 'Multiway'
save = False # change to True; param is False only for demonstration purposes

In [None]:
df = update_pairs(data_auth, langs, rows, d_type, save)
df.tail(30) # 30 distinct pairs

### English-Centric example using the [Samanantar](https://huggingface.co/datasets/ai4bharat/samanantar) dataset from Hugging Face
An English-centric dataset covering *n* languages (English included) has *n − 1* language pairs, so there will be 11 unique pairs for Samanantar.
- If the dataset doesn't exist in the ```mt_hf.csv``` dataset, you will need to add it manually to ```mt_hf_external.csv```.

In [None]:
# Arguments for function
data_auth = 'ai4bharat/samanantar' # if external check the main contributor to the dataset
configs = get_dataset_config_names(data_auth)
print(configs)

In [None]:
langs = configs.copy()
langs.append('en') # ensure English is in the list
d_type = 'English-Centric'
save = False # change to True; param is False only for demonstration purposes
print(langs)

English-centric datasets may not have the same number of rows for every language pair! We therefore build a dictionary that maps each language-pair config to its `[train, validation, test]` row counts.

In [None]:
# this is easy if the config is similar to Samanantar:
# every config exposes its split sizes through the dataset builder's metadata.
# Each entry of `pairs` is [train, validation, test]; a slot stays 0 when no
# split name starts with the matching prefix.
pairs = {}
for config in configs:
    builder = load_dataset_builder(data_auth, config)
    info = builder.info
    rows = [0, 0, 0]
    for split_name, split_info in info.splits.items():
        for position, prefix in enumerate(('train', 'val', 'test')):
            if split_name.startswith(prefix):
                rows[position] = split_info.num_examples

    pairs[config] = rows

In [None]:
# Otherwise you'll have to manually enter the number of rows or think of a programmatic solution.
test = {}
test['as'] = [141226, 0, 0]
test['bn'] = [8604579, 0, 0]

In [None]:
print("There will be 11 distinct pairs for the Samanantar dataset.")

In [None]:
df = update_pairs(data_auth, langs, pairs, d_type, save)
df.tail(11) 

### Simple parallel example using the [Filtered-Japanese-English-Parallel-Corpus](https://huggingface.co/datasets/Moleys/Filtered-Japanese-English-Parallel-Corpus) dataset from Hugging Face
A simple parallel dataset contains only 2 languages, i.e. a single language pair.

In [None]:
data_auth = 'Moleys/Filtered-Japanese-English-Parallel-Corpus' # if external check the main contributor to the dataset
langs = ['ja', 'en']
rows = [10739509, 0, 0] 
d_type = 'Simple Parallel'
save = False # change to True; param is False only for demonstration purposes

In [None]:
df = update_pairs(data_auth, langs, rows, d_type, save)
df.tail(1)

In [None]:
dataset_name = 'FBK-MT/mGeNTE'

In [None]:
configs = get_dataset_config_names(dataset_name)
configs

In [None]:
ds = load_dataset(dataset_name, configs[1])
# builder = load_dataset_builder()

In [None]:
ds

In [None]:
# Arguments for the update_pairs call in the next cell (en-it, test split only).
# data_auth = 'FBK-MT/gender-bias-PE' # if external check the main contributor to the dataset
data_auth = dataset_name
langs = ['en', 'it']
rows = [0, 0, 1500] 
d_type = 'Simple Parallel'
save = True # NOTE(review): unlike the demonstration cells, save is True here, so update_pairs is asked to save its changes — confirm this is intended before re-running

In [None]:
df = update_pairs(data_auth, langs, rows, d_type, save)
df.tail(1)

In [None]:
df.tail()

## 3. Misc

In [None]:
# Parse the TMX translation memory with 'nl' as source and 'fr' as target language.
# The file is opened in binary mode so the XML parser receives raw bytes and
# applies the encoding declared in the file, instead of text decoded with the
# platform's default encoding.
with open('Belgium_justice.tmx', 'rb') as fin:
    file = tmxfile(fin, 'nl', 'fr')

In [None]:
# Number of translation units yielded by the parsed TMX file.
# (each unit exposes .source / .target if the contents need to be inspected)
count = sum(1 for _unit in file.unit_iter())

In [None]:
count

In [None]:
# re.fullmatch(pattern, configs[12])

In [None]:
import re

# Pattern for language-code pairs: two lowercase 2-3 letter codes joined by
# '-' or '2', each optionally followed by a '_'/'-' separated suffix
# (e.g. a script or region tag).
# Earlier, stricter pattern (no suffixes allowed):
# pattern = r'^[a-z]{2,3}(-|2)[a-z]{2,3}$'
pattern = r'[a-z]{2,3}((_|-)\w+)?(-|2)[a-z]{2,3}((_|-)\w+)?' # new pattern!

# Example language code pairs
codes = ['en-es', 'fr-de', 'zh-en', 'EN-es', 'eng-es_AM', 'ara_blahblah', 'iwslt14_de_en', 'amh_Ethi-arb_Arab']

# Keep only the codes that match the pattern over their whole length
valid_codes = [code for code in codes if re.fullmatch(pattern, code)]

# Rejected: 'EN-es' (uppercase first code); 'ara_blahblah' and 'iwslt14_de_en'
# (no '-' or '2' separator between two codes).
print(valid_codes)  # Output: ['en-es', 'fr-de', 'zh-en', 'eng-es_AM', 'amh_Ethi-arb_Arab']


In [None]:
pattern = r'[a-z]{2,3}-[a-z]{2,3}(_-)?.*'
# pattern = 'en-zh'

In [None]:
string = "aya_dataset"
re.search(pattern, string)
# help(re.match)