In [2]:
!pip3 install pandas fastparquet



In [14]:
import pandas as pd

# Parquet file name of each split inside the coastalcph/tydi_xor_rc dataset.
splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}

# Read both splits with the fastparquet engine, train first and validation second.
df_train, df_val = (
    pd.read_parquet(f"hf://datasets/coastalcph/tydi_xor_rc/{splits[split_key]}", engine='fastparquet')
    for split_key in ('train', 'validation')
)
print(f"Train size: {len(df_train)}, Validation size: {len(df_val)}")

Train size: 15343, Validation size: 3011


In [15]:
df_train.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...,WikiLeaks () is an international non-profit or...,bn,True,182,2006,
1,দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?,The war in Europe concluded with an invasion o...,bn,True,48,Germany,
2,মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...,Same-sex marriage in the United States expande...,bn,False,-1,no,
3,আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...,The exact number of Arab casualties is unknown...,bn,True,39,unknown,
4,বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?,"As Thomas Hall (2000) notes, ""The Sung Empire ...",bn,True,1219,17th century,


In [16]:
# Carve Arabic (ar), Korean (ko) and Telugu (te) subsets out of each split via the `lang` column.
# Training subsets
df_train_ar, df_train_ko, df_train_te = (
    df_train[df_train['lang'] == lang_code] for lang_code in ('ar', 'ko', 'te')
)

# Validation subsets
df_val_ar, df_val_ko, df_val_te = (
    df_val[df_val['lang'] == lang_code] for lang_code in ('ar', 'ko', 'te')
)


In [17]:
# Tokenization of words and counting occurrences
import unicodedata as ud
from collections import Counter

from nltk import word_tokenize
    
def tokenize_question(question):
    """Strip punctuation from a question and split it into word tokens.

    Every character whose Unicode general category starts with 'P'
    (the punctuation classes) is dropped; the remaining text is then
    handed to NLTK's ``word_tokenize``.

    Args:
        question: The question text.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    without_punctuation = ''.join(
        ch for ch in question if not ud.category(ch).startswith('P')
    )
    return word_tokenize(without_punctuation)

def word_occurrence(questions, tokenize=None):
    """Count how often each token occurs across a collection of questions.

    Args:
        questions: Iterable of question strings.
        tokenize: Optional callable mapping one question to an iterable of
            tokens. Defaults to ``tokenize_question``, so existing
            single-argument calls behave as before; pass another callable to
            plug in a language-specific tokenizer.

    Returns:
        dict mapping each token to its number of occurrences, keyed in order
        of first appearance.
    """
    if tokenize is None:
        tokenize = tokenize_question
    counts = Counter()
    for question in questions:
        counts.update(tokenize(question))
    # Hand back a plain dict, as before, so looking up an unseen word still raises KeyError.
    return dict(counts)

# Word counts per language (ar, ko, te) for the training questions
word_count_ar, word_count_ko, word_count_te = (
    word_occurrence(frame['question'].tolist())
    for frame in (df_train_ar, df_train_ko, df_train_te)
)

# Word counts per language (ar, ko, te) for the validation questions
word_count_ar_val, word_count_ko_val, word_count_te_val = (
    word_occurrence(frame['question'].tolist())
    for frame in (df_val_ar, df_val_ko, df_val_te)
)


In [34]:
# Use the facebook/nllb-200-distilled-600M model to translate a word from a given language to English
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint name passed to both from_pretrained calls below.
model_name = "facebook/nllb-200-distilled-600M"
# Language code of the translation target; translate_word also converts it to the forced first generated token.
target_lang = "eng_Latn"

# The tokenizer is created with tgt_lang set to the target language code; the model is loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, tgt_lang=target_lang)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def translate_word(word, src_lang):
    """Translate ``word`` from ``src_lang`` into the notebook's target language.

    Uses the module-level ``tokenizer`` and ``model``; the target is the
    module-level ``target_lang`` code.

    Args:
        word: Text to translate.
        src_lang: Source language code assigned to ``tokenizer.src_lang``
            (e.g. ``'arb_Arab'``).

    Returns:
        The decoded translation of the first (and only) generated sequence,
        with special tokens removed.

    Note:
        This sets ``tokenizer.src_lang`` on the shared tokenizer, so the
        value persists after the call.
    """
    # The source language is set on the tokenizer before the text is encoded.
    tokenizer.src_lang = src_lang
    encoded = tokenizer(word, return_tensors="pt")
    # Pass the whole encoding, not just input_ids, so generate() also receives
    # the attention mask produced by the tokenizer.
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

def translate_word_list(word_list, src_lang):
    """Translate every word in ``word_list`` with ``translate_word``.

    Args:
        word_list: Iterable of words to translate.
        src_lang: Source language code forwarded to ``translate_word``.

    Returns:
        dict mapping each input word to its translation.
    """
    return {term: translate_word(term, src_lang) for term in word_list}

TOP_N_WORDS = 5  # number of most frequent words translated per language and split


def top_words(word_count, n=TOP_N_WORDS):
    """Return the ``n`` most frequent words of a ``{word: count}`` mapping.

    Words are ordered by descending count. ``sorted`` is stable, so words
    with equal counts keep the mapping's iteration order.
    """
    ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _ in ranked[:n]]


# Arabic: training split, then validation split
most_common_words_ar = top_words(word_count_ar)
translations_ar = translate_word_list(most_common_words_ar, 'arb_Arab')
most_common_words_ar_val = top_words(word_count_ar_val)
translations_ar_val = translate_word_list(most_common_words_ar_val, 'arb_Arab')

# Korean: training split, then validation split
most_common_words_ko = top_words(word_count_ko)
translations_ko = translate_word_list(most_common_words_ko, 'kor_Hang')
most_common_words_ko_val = top_words(word_count_ko_val)
translations_ko_val = translate_word_list(most_common_words_ko_val, 'kor_Hang')

# Telugu: training split, then validation split
most_common_words_te = top_words(word_count_te)
translations_te = translate_word_list(most_common_words_te, 'tel_Telu')
most_common_words_te_val = top_words(word_count_te_val)
translations_te_val = translate_word_list(most_common_words_te_val, 'tel_Telu')

In [33]:
# Example question in Arabic
print("Example question Arabic:", df_train_ar.iloc[0]['question'])

# Print the same statistics for the training split, then the validation split
for header, frame, counts, translated in (
    ("\nTraining Data Statistics for Arabic:", df_train_ar, word_count_ar, translations_ar),
    ("\nValidation Data Statistics for Arabic:", df_val_ar, word_count_ar_val, translations_ar_val),
):
    total_words = sum(counts.values())
    top_five = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:5]
    print(header)
    print(f"Total size (ar): {len(frame)}")
    print(f"Count of unique words (ar): {len(counts)}")
    print(f"Count of all words (ar): {total_words}")
    print(f"Average number of words per question (ar): {total_words / len(frame)}")
    print("5 most common words (ar):", top_five)
    print("Translations of the 5 most common words (ar):", translated)

Example question Arabic: متى تدخلت روسيا في  الحرب الأهلية السورية؟

Training Data Statistics for Arabic:
Total size (ar): 2558
Count of unique words (ar): 5401
Count of all words (ar): 16191
Average number of words per question (ar): 6.3295543393276
5 most common words (ar): [('في', 593), ('من', 586), ('متى', 535), ('ما', 442), ('هو', 349)]
Translations of the 5 most common words (ar): {'في': 'In the', 'من': 'Who ?', 'متى': 'When ?', 'ما': 'What ?', 'هو': 'It is .'}

Validation Data Statistics for Arabic:
Total size (ar): 415
Count of unique words (ar): 1180
Count of all words (ar): 2617
Average number of words per question (ar): 6.306024096385542
5 most common words (ar): [('من', 113), ('في', 90), ('ما', 81), ('هو', 66), ('متى', 65)]
Translations of the 5 most common words (ar): {'من': 'Who ?', 'في': 'In the', 'ما': 'What ?', 'هو': 'It is .', 'متى': 'When ?'}


In [31]:
# Example question in Korean
print("Example question Korean:", df_train_ko.iloc[0]['question'])

# Print the same statistics for the training split, then the validation split
for header, frame, counts, translated in (
    ("\nTraining Data Statistics for Korean:", df_train_ko, word_count_ko, translations_ko),
    ("\nValidation Data Statistics for Korean:", df_val_ko, word_count_ko_val, translations_ko_val),
):
    total_words = sum(counts.values())
    top_five = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:5]
    print(header)
    print(f"Total size (ko): {len(frame)}")
    print(f"Count of unique words (ko): {len(counts)}")
    print(f"Count of all words (ko): {total_words}")
    print(f"Average number of words per question (ko): {total_words / len(frame)}")
    print("5 most common words (ko):", top_five)
    print("Translations of the 5 most common words (ko):", translated)

Example question Korean: 30년 전쟁의 승자는 누구인가?

Training Data Statistics for Korean:
Total size (ko): 2422
Count of unique words (ko): 4396
Count of all words (ko): 11846
Average number of words per question (ko): 4.8909991742361685
5 most common words (ko): [('가장', 527), ('무엇인가', 497), ('언제', 336), ('몇', 234), ('어디인가', 228)]
Translations of the 5 most common words (ko): {'가장': 'The most', '무엇인가': 'Something.', '언제': 'When?', '몇': 'A few.', '어디인가': 'Where are you?'}

Validation Data Statistics for Korean:
Total size (ko): 356
Count of unique words (ko): 828
Count of all words (ko): 1729
Average number of words per question (ko): 4.856741573033708
5 most common words (ko): [('무엇인가', 75), ('가장', 66), ('언제', 44), ('어디인가', 29), ('큰', 24)]
Translations of the 5 most common words (ko): {'무엇인가': 'Something.', '가장': 'The most', '언제': 'When?', '어디인가': 'Where are you?', '큰': 'Big one.'}


In [32]:
# Example question in Telugu
print("Example question Telugu:", df_train_te.iloc[0]['question'])

# Print the same statistics for the training split, then the validation split
for header, frame, counts, translated in (
    ("\nTraining Data Statistics for Telugu:", df_train_te, word_count_te, translations_te),
    ("\nValidation Data Statistics for Telugu:", df_val_te, word_count_te_val, translations_te_val),
):
    total_words = sum(counts.values())
    top_five = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:5]
    print(header)
    print(f"Total size (te): {len(frame)}")
    print(f"Count of unique words (te): {len(counts)}")
    print(f"Count of all words (te): {total_words}")
    print(f"Average number of words per question (te): {total_words / len(frame)}")
    print("5 most common words (te):", top_five)
    print("Translations of the 5 most common words (te):", translated)

Example question Telugu: ప్రపంచంలో  మొట్టమొదటి దూర విద్య విద్యాలయం ఏ దేశంలో స్థాపించబడింది ?

Training Data Statistics for Telugu:
Total size (te): 1355
Count of unique words (te): 2411
Count of all words (te): 7666
Average number of words per question (te): 5.657564575645757
5 most common words (te): [('ఎవరు', 274), ('ఏది', 192), ('ఎన్ని', 165), ('ఎప్పుడు', 154), ('ఏ', 142)]
Translations of the 5 most common words (te): {'ఎవరు': 'Who is it?', 'ఏది': 'What is it?', 'ఎన్ని': 'How many', 'ఎప్పుడు': 'When', 'ఏ': 'No, not at all.'}

Validation Data Statistics for Telugu:
Total size (te): 384
Count of unique words (te): 741
Count of all words (te): 2299
Average number of words per question (te): 5.986979166666667
5 most common words (te): [('ఏ', 92), ('ఏది', 76), ('ఎవరు', 74), ('భారతదేశంలో', 45), ('ఎంత', 40)]
Translations of the 5 most common words (te): {'ఏ': 'No, not at all.', 'ఏది': 'What is it?', 'ఎవరు': 'Who is it?', 'భారతదేశంలో': 'In India', 'ఎంత': 'How much'}
