In [1]:
%pip install datasets
%pip install transformers torch
%pip install sentencepiece
%pip install sacremoses

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
#https://huggingface.co/datasets/coastalcph/tydi_xor_rc
from datasets import load_dataset
import pandas as pd
from collections import Counter
import torch
from transformers import MarianMTModel, MarianTokenizer
import nltk
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
print(device)

cuda


In [5]:
dataset = load_dataset("coastalcph/tydi_xor_rc")
train_set = dataset["train"]
validation_set = dataset["validation"]

## Some statistics for a) is in the following blocks

In [6]:
#https://huggingface.co/docs/datasets/process

def get_lang(dataset, language):
    return dataset.filter(lambda sample : sample["lang"] == language)

def get_lang_length(dataset, language):
    return len(get_lang(dataset, language))

def get_answerable(dataset):
    return dataset.filter(lambda sample : sample["answerable"])

def get_unanswerable(dataset):
    return dataset.filter(lambda sample : not(sample["answerable"]))

In [7]:
#Get number of samples for each langugage for train and validation set
train_finnish = get_lang(train_set, "fi")
val_finnish = get_lang(train_set, "fi")

train_russian = get_lang(train_set, "ru")
val_russian = get_lang(train_set, "ru")

train_japan = get_lang(train_set, "ja")
val_japan = get_lang(train_set, "ja")

{("N_train_finnish", len(train_finnish)), ("N_val_finnish", len(val_finnish)), ("N_train_japan", len(train_japan)), ("N_val_japan", len(val_japan)),
 ("N_train_russian", len(train_russian)), ("N_val_russian", len(val_russian))}

{('N_train_finnish', 2126),
 ('N_train_japan', 2301),
 ('N_train_russian', 1983),
 ('N_val_finnish', 2126),
 ('N_val_japan', 2301),
 ('N_val_russian', 1983)}

In [8]:
#Get number of answerables for each language:
ans_train_finnish = len(get_answerable(train_finnish))
ans_val_finnish = len(get_answerable(val_finnish))

ans_train_russian = len(get_answerable(train_russian))
ans_val_russian = len(get_answerable(val_russian))

ans_train_japan = len(get_answerable(train_japan))
ans_val_japan = len(get_answerable(val_japan))

{("N_train_finnish_ans", ans_train_finnish), ("N_val_finnish_ans", ans_val_finnish), ("N_train_japan_ans", ans_train_japan), ("N_val_japan_ans", ans_val_japan),
 ("N_train_russian_ans", ans_train_russian), ("N_val_russian_ans", ans_val_russian)}

{('N_train_finnish_ans', 1872),
 ('N_train_japan_ans', 1929),
 ('N_train_russian_ans', 1756),
 ('N_val_finnish_ans', 1872),
 ('N_val_japan_ans', 1929),
 ('N_val_russian_ans', 1756)}

In [9]:
#Assert that answer for unanswered is Yes or No 

def get_invalids(dataset):
    unanswered_dataset = get_unanswerable(dataset)
    invalids = unanswered_dataset.filter(lambda sample : not(sample["answer"] == "no" or sample["answer"] == "yes"))
    return invalids

invalids = get_invalids(train_finnish)
print(len(invalids))
print(invalids["answer"])

#Some of the the questions that cannot be answered from the context has answers that is not "yes" or "no".

32
['15', '13000', 'Army and Air Force of the Russian Soviet Federative Socialist Republic', 'São Paulo', 'heavy rain', 'During the Three Kingdoms era', '1870', '1971', '15', '13000', 'Army and Air Force of the Russian Soviet Federative Socialist Republic', 'São Paulo', 'heavy rain', 'in November 2004', 'During the Three Kingdoms era', 'October 21, 1957', 'July 17, 1985', 'In the province of Varsinais-Suomen. It is 12 km from Masku to Raisio, 15 km to Naantali and 18 km to Turku', 'a government area surrounded by a castle', 'more than 55 million', '15', '13000', 'Jefferson Davis', 'transition series of elements', 'São Paulo', 'heavy rain', 'November 10, 1994', 'During the Three Kingdoms era', 'October 21, 1957', '1870', '1971', 'In the province of Varsinais-Suomen. It is 12 km from Masku to Raisio, 15 km to Naantali and 18 km to Turku']


In [10]:
#Assert that if answerable, then answer is a substring in context
def verify_answerable_is_substring_in_context(dataset):
    answered_dataset = get_answerable(dataset)
    return answered_dataset.filter(lambda sample : not(sample["answer"] in sample["context"]))

def verify_errors(invalids):
    return invalids.filter(lambda sample : not(sample["answerable"] == True and sample["answer_start"] == -1))

invalids = verify_answerable_is_substring_in_context(train_set)
undetected_errors = verify_errors(invalids)
print(invalids["answer_start"][1])
print(invalids["answerable"][1])
print(invalids["answer"][1])
print(invalids["context"][1])

#Note: Some questions are answerable but -1 in answer_start. Fix this by changing them to unanswerable

-1
True
1772
Lalan (Born: 1774 AD - Died: October 17, 1890 AD) was a Bengali with many talents; Who is also known as Fakir Lalan, Lalan Sai, Lalan Shah, Mahatma Lalan etc. He is a spiritual Baul saint, humanist, social reformer and philosopher. He was the lyricist, composer and singer of numerous songs. Lalan is considered one of the pioneers of Baul music and is also referred to as the 'Baul-Emperor'. It was through his songs that Baul song gained popularity in the 19th century. Lalan was a humanitarian saint. He who removed from all kinds of ethnic differences including religion, caste, gotra and gave humanity the highest place. He is from this non-sectarian attitude


In [11]:
train_df = train_set.to_pandas()
validation_df = validation_set.to_pandas()

In [12]:
train_df = train_df[train_df['lang'].isin(['fi', 'ja', 'ru'])]
validation_df = validation_df[validation_df['lang'].isin(['fi', 'ja', 'ru'])]

In [13]:
#For Cleaning

def clean_dataset(df):
    df.loc[(df["answerable"].isin([True])) & (df["answer_start"].isin([-1])), "answerable"] = False

def verify_cleaned_dataset(df):
    print(len(df[(df["answerable"].isin([True])) & (df["answer_start"].isin([-1]))]))


In [14]:
def print_num_of_answer_inlang(df):
    not_null_count = df['answer_inlang'].notna().sum()
    total_count = len(df)
    print(f"Number of rows where 'answer_inlang' is not null: {not_null_count} out of {total_count}")

In [15]:
print_num_of_answer_inlang(train_df)
print_num_of_answer_inlang(validation_df)

Number of rows where 'answer_inlang' is not null: 150 out of 6410
Number of rows where 'answer_inlang' is not null: 300 out of 1380


## Next Cells are for question b)

In [16]:
models = {
    'fi': (
        MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-fi-en'), 
        MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-fi-en')
    ),
    'ja': (
        MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ja-en'), 
        MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ja-en')
    ),
    'ru': (
        MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ru-en'), 
        MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ru-en')
    )
}

  return self.fget.__get__(instance, owner)()


In [17]:
def translate_text(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        translated = model.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

In [18]:
def get_translated_questions(df, language):
    model, tokenizer = models[language]
    filtered_df = df[df['lang'] == language]
    translations = []
    for question in filtered_df['question'].dropna():
        translated_question = translate_text(question, model, tokenizer)
        translations.append(translated_question)
    return translations

In [19]:
def get_most_common_words(df, language):
    questions = ' '.join(get_translated_questions(df, language))
    words = word_tokenize(questions.lower())
    words = [word for word in words if word.isalnum()]
    word_counts = Counter(words)
    most_common_words = word_counts.most_common(5)
    return most_common_words

In [14]:
languages = ['fi', 'ja', 'ru']
for lang in languages:
    common_words = get_most_common_words(train_df, lang)
    print(f"Most common words in questions for language '{lang}':")
    for word, count in common_words:
        print(f"{word}: {count}")
    print()

Most common words in questions for language 'fi':
the: 1263
what: 772
was: 532
of: 462
in: 442



## Next is for Question c)

In [20]:

#TODO: Move translation to beginning of pipeline

def process_question(question, translater, tokenizer):
    translated = translate_text(question, translater, tokenizer)
    words = word_tokenize(translated.lower())
    words = [word for word in words if word.isalnum()]
    return words

def process_context(context):
    words = [word for word in context if word.isalnum()]
    return words

def question_in_context_freq(question_tokens, context_tokens):
    unique_question_tokens = set(question_tokens)
    unique_context_tokens = set(context_tokens)
    cnt = len(unique_question_tokens.intersection(unique_context_tokens))
    return cnt/len(unique_question_tokens)

def get_freqs(questions, contexts):
    pairs = zip(questions, contexts)
    freqs = []
    for pair in pairs:
        freqs.append(question_in_context_freq(pair[0],pair[1]))
    return freqs

df_train_finnish = train_df[train_df["lang"].isin(["fi"])]
translater = models["fi"][0]
tokenizer = models["fi"][1]

questions_processed = [process_question(question, translater, tokenizer) for question in df_train_finnish["question"]]
context_processed = [process_context(context) for context in df_train_finnish["context"]]
print(questions_processed)

KeyboardInterrupt: 