In [None]:
# !wget https://dl.fbaipublicfiles.com/XNLI/XNLI-15way.zip

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pyarrow>=12.0.0
  Downloading pyarrow-16.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 KB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pyarrow-hotfix
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Col

In [4]:
from datasets import load_dataset
import numpy as np
import pandas as pd


import torch
import torch.nn.functional as F

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM # XGLMTokenizer, XGLMForCausalLM,
from tqdm import tqdm

## Data prepocessing

#### Run only once


In [None]:

xnli_dataset = load_dataset("xnli", 'all_languages')

premise = xnli_dataset['train']['premise'][:220]
hypothesis = xnli_dataset['train']['hypothesis'][:220]
label = xnli_dataset['train']['label'][:220]


def get_parallel_corpus(l1 = "en", l2 = "fr", l3 = "ru"):

    premise_l1 = [i[l1] for i in premise]
    premise_l2 = [i[l2] for i in premise]
    premise_l3 = [i[l3] for i in premise]

    l1_idx = [i['language'].index(l1) for i in hypothesis]
    l2_idx = [i['language'].index(l2) for i in hypothesis]
    l3_idx = [i['language'].index(l3) for i in hypothesis]


    hypothesis_l1 = [i['translation'][idx] for i, idx in zip(hypothesis, l1_idx)]
    hypothesis_l2 = [i['translation'][idx] for i, idx in zip(hypothesis, l2_idx)]
    hypothesis_l3 = [i['translation'][idx] for i, idx in zip(hypothesis, l3_idx)]


    parallel_corpus = pd.DataFrame()

    parallel_corpus["premise_"+l1] = premise_l1
    parallel_corpus["premise_"+l2] = premise_l2
    parallel_corpus["premise_"+l3] = premise_l3

    parallel_corpus["hypothesis_"+l1] = hypothesis_l1
    parallel_corpus["hypothesis_"+l2] = hypothesis_l2
    parallel_corpus["hypothesis_"+l3] = hypothesis_l3

    parallel_corpus['label'] = label

    return parallel_corpus



parallel_corpus = get_parallel_corpus()
parallel_corpus.to_csv("./parallel_corpus_xnli.csv", index=False)



## Analysis 

In [5]:
parallel_corpus = pd.read_csv("./parallel_corpus_xnli.csv")

In [6]:
# 0 -> Entailment
# 1 -> Neutral
# 3 -> Contradiction

class args:
    max_new_tokens = 100
    temperature = 0.6
    device="cuda:0"
    l1 = "en"
    l2 = "fr"
    l3 = "ru" 

    
# Label Encoding and Decoding

label_encoding = {
    'en': {
        'label_encoder': {
            'true': 0,
            'inconclusive': 1,
            'false': 2
        },
        'label_decoder': {
            0: "true",
            1: "inconclusive",
            2: "false"
        }
    },
    'fr': {
        'label_encoder': {
            'vrai': 0,
            'peu concluant': 1,
            'faux': 2
        },
        'label_decoder': {
            0: "vrai",
            1: "peu concluant",
            2: "faux"
        }
    },
    'ru': {
        'label_encoder': {
            'истинный': 0,
            'неубедительный': 1,
            'ЛОЖЬ': 2
        },
        'label_decoder': {
            0: "истинный",
            1: "неубедительный",
            2: "ЛОЖЬ"
        }
    }
}



In [7]:
label = list(parallel_corpus['label'])

In [8]:
# https://huggingface.co/docs/transformers/en/main_classes/text_generation

def inference(prompt, max_new_tokens=args.max_new_tokens, temperature=args.temperature, output_logits=False):

    
    model_inputs = tokenizer([prompt], return_tensors="pt").to(args.device)

    # For removal of input string from the output string
    input_ids_cutoff = model_inputs.input_ids.size(dim=1)
    
    
    if output_logits:
        generated_ids = model.generate(**model_inputs,
                                       max_new_tokens=args.max_new_tokens,
                                       top_p = 1,
                                       temperature = args.temperature,
                                       return_dict_in_generate=True,
                                       output_logits=True,
                                       do_sample=True, 
                                       output_scores=True, 
                                       pad_token_id=tokenizer.eos_token_id)
        completion = tokenizer.decode(
                generated_ids['sequences'][0],
                skip_special_tokens=True)
        
    else:
        generated_ids = model.generate(**model_inputs,
                                       max_new_tokens=args.max_new_tokens,
                                       top_p = 1,
                                       temperature = args.temperature,
                                       do_sample=True, 
                                       pad_token_id=tokenizer.eos_token_id)
        completion = tokenizer.decode(
                generated_ids[0],
                skip_special_tokens=True)
    
#     print(generated_ids)
    

    
    return completion, generated_ids

In [13]:
def get_predictions(prompt, eval_lang):
    # Function which returns the predictions for a given language
    

    prediction = []

    for i in tqdm(range(200)):
        pre = parallel_corpus['premise_'+eval_lang][i]
        hyp = parallel_corpus['hypothesis_'+eval_lang][i]

        pred, generated_dict = inference(prompt.replace("premise", pre).replace("hypothesis", hyp))

        prediction.append(pred.lower())
        
    return prediction


In [14]:
def get_correct_pred(prediction, eval_lang):
    # Function to get the count of correct and irrelevant prediction
    
    
    # Count the number of correct instances 
    correct = 0

    # Count the number of instances where the model predicted irrelevant content 
    #or violated the given instruction in the prompt
    inconclusive = 0
    
    label_decoder = label_encoding[eval_lang]['label_decoder']
    label_encoder = label_encoding[eval_lang]['label_encoder']
    

    for i in range(len(prediction)):
        validation = ["Success" if k==prediction[i] else "Failure" for k in list(label_decoder.values()) ]

        if "Success" in validation:

            if label[i]==label_encoder[prediction[i]]:
                correct+=1
        else:
            inconclusive+=1
            
    return correct, inconclusive

    

In [15]:
# get_correct_pred(prediction)

In [16]:
idx = 216
num_samples = 3

def get_ic_examples(lang, idx=idx, num_samples=num_samples):
    """
    lang: language considered for extracting examples
    idx: Initial index
    num_samples: total samples

    Returns incontext examples for few shots experiments
    """
    label_decoder = label_encoding[lang]['label_decoder']
    
    pre = list(parallel_corpus.iloc[idx: idx+num_samples]['premise_'+lang])
    hyp = list(parallel_corpus.iloc[idx: idx+num_samples]['hypothesis_'+lang])
    lab = list(parallel_corpus.iloc[idx: idx+num_samples]['label'])
    
    lab = [label_decoder[l] for l in lab]

    ic_examples = ["{}\n{}\n{}\n".format(p, h, l) for p, h, l in zip(pre, hyp, lab)]
    
    return "\n".join(ic_examples)



In [17]:
prompt_en = """Take the following as truth: {{premise}}
Then the following statement: "{{hypothesis}}" is {{"true"}}, {{"false"}}, or
{{"inconclusive"}}?"""


prompt_fr = """
Considérez ce qui suit comme la vérité : {{premise}}
Ensuite, l'instruction suivante : "{{hypothesis}}" est {{"true"}}, {{"false"}} ou
{{"non concluant"}} ?
"""

prompt_ru = """
Примите за истину следующее: {{предпосылка}}
Тогда следующее утверждение: "{{гипотеза}}" является {{"истинным"}}, {{"ложным"}} или
{{"неубедительно"}}?
"""



incontext_en = prompt_en+"""

You can take the help from following example: 
{}
"""


incontext_fr = prompt_fr+"""

Vous pouvez vous aider de l'exemple suivant:
{}
""" 

incontext_ru = prompt_ru+"""

Вы можете воспользоваться помощью следующего примера: 
{}
""" 


# premise = "Mango is a fruit"
# hypothesis = "Mango is a mobile phone."
# inference(prompt_2.replace("premise", premise).replace("hypothesis", hypothesis))

# Base Model

In [13]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-base")
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-base")
model.to(args.device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
         

### Zero Shot Setup

### English

In [21]:
predictions_en = get_predictions(prompt_en, eval_lang=args.l1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:11<00:00, 16.70it/s]


In [26]:
correct, inconclusive = get_correct_pred(predictions_en, eval_lang=args.l1)
correct, inconclusive 

(83, 2)

### French

In [235]:
predictions_fr = get_predictions(prompt_fr, eval_lang=args.l2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:20<00:00,  9.92it/s]


In [236]:
correct, inconclusive = get_correct_pred(predictions_fr, eval_lang=args.l1)
correct, inconclusive 

(82, 2)

### Russian

In [248]:
predictions_ru = get_predictions(prompt_ru, eval_lang=args.l3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:26<00:00,  2.30it/s]


In [613]:
correct, inconclusive = get_correct_pred(predictions_ru, eval_lang=args.l1)
correct, inconclusive 

(28, 120)

### Incontext setup

### Russian queries and English Eval

In [30]:
predictions_ru_en = get_predictions(incontext_ru.format(get_ic_examples(args.l3)), eval_lang=args.l1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:54<00:00,  3.66it/s]


In [31]:
correct, inconclusive = get_correct_pred(predictions_ru_en, eval_lang=args.l1)

In [32]:
correct, inconclusive

(8, 172)

### Russian queries and French Eval

In [33]:
predictions_ru_fr = get_predictions(incontext_ru.format(get_ic_examples(args.l3)), eval_lang=args.l2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:22<00:00,  2.41it/s]


In [34]:
correct, inconclusive = get_correct_pred(predictions_ru_fr, eval_lang=args.l1)

In [35]:
correct, inconclusive

(12, 170)

### English query and French Eval

In [36]:
predictions_en_fr = get_predictions(incontext_en.format(get_ic_examples(args.l1)), eval_lang=args.l2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:15<00:00, 12.78it/s]


In [37]:
correct, inconclusive = get_correct_pred(predictions_en_fr, eval_lang=args.l1)

In [38]:
correct, inconclusive

(77, 0)

### English query and Russian Eval

In [39]:
predictions_en_ru = get_predictions(incontext_en.format(get_ic_examples(args.l1)), eval_lang=args.l3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:09<00:00, 20.75it/s]


In [40]:
correct, inconclusive = get_correct_pred(predictions_en_ru, eval_lang=args.l1)

In [41]:
correct, inconclusive

(78, 1)

### French query and English Eval

In [42]:
predictions_fr_en = get_predictions(incontext_fr.format(get_ic_examples(args.l2)), eval_lang=args.l1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:09<00:00, 20.82it/s]


In [48]:
correct, inconclusive = get_correct_pred(predictions_fr_en, eval_lang=args.l1)

In [49]:
correct, inconclusive

(75, 2)

### French query and Russian Eval

In [45]:
predictions_fr_ru = get_predictions(incontext_fr.format(get_ic_examples(args.l2)), eval_lang=args.l3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:09<00:00, 20.80it/s]


In [46]:
correct, inconclusive = get_correct_pred(predictions_fr_ru, eval_lang=args.l1)

In [47]:
correct, inconclusive

(75, 2)

# Large Model

In [51]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large")
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large")
model.to(args.device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 1024)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 1024)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=Fals

### Zero Shot Setup

### English

In [52]:
predictions_en = get_predictions(prompt_en, eval_lang=args.l1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:20<00:00,  9.65it/s]


In [53]:
correct, inconclusive = get_correct_pred(predictions_en, eval_lang=args.l1)
correct, inconclusive 

(90, 0)

### French

In [54]:
predictions_fr = get_predictions(prompt_fr, eval_lang=args.l2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:20<00:00,  9.87it/s]


In [55]:
correct, inconclusive = get_correct_pred(predictions_fr, eval_lang=args.l1)
correct, inconclusive 

(95, 0)

### Russian

In [56]:
predictions_ru = get_predictions(prompt_ru, eval_lang=args.l3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:35<00:00,  5.64it/s]


In [58]:
correct, inconclusive = get_correct_pred(predictions_ru, eval_lang=args.l1)
correct, inconclusive 

(60, 35)

### Incontext setup

### Russian queries and English Eval

In [59]:
predictions_ru_en = get_predictions(incontext_ru.format(get_ic_examples(args.l3)), eval_lang=args.l1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:02<00:00,  3.21it/s]


In [63]:
# predictions_ru_en

In [60]:
correct, inconclusive = get_correct_pred(predictions_ru_en, eval_lang=args.l1)

In [61]:
correct, inconclusive

(24, 131)

### Russian queries and French Eval

In [64]:
predictions_ru_fr = get_predictions(incontext_ru.format(get_ic_examples(args.l3)), eval_lang=args.l2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:50<00:00,  3.96it/s]


In [65]:
correct, inconclusive = get_correct_pred(predictions_ru_fr, eval_lang=args.l1)

In [66]:
correct, inconclusive

(26, 131)

### English query and French Eval

In [67]:
predictions_en_fr = get_predictions(incontext_en.format(get_ic_examples(args.l1)), eval_lang=args.l2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:24<00:00,  8.16it/s]


In [68]:
correct, inconclusive = get_correct_pred(predictions_en_fr, eval_lang=args.l1)

In [69]:
correct, inconclusive

(88, 0)

### English query and Russian Eval

In [70]:
predictions_en_ru = get_predictions(incontext_en.format(get_ic_examples(args.l1)), eval_lang=args.l3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:17<00:00, 11.13it/s]


In [71]:
correct, inconclusive = get_correct_pred(predictions_en_ru, eval_lang=args.l1)

In [72]:
correct, inconclusive

(82, 0)

### French query and English Eval

In [79]:
predictions_fr_en = get_predictions(incontext_fr.format(get_ic_examples(args.l2)), eval_lang=args.l1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:27<00:00,  7.26it/s]


In [80]:
correct, inconclusive = get_correct_pred(predictions_fr_en, eval_lang=args.l1)

In [81]:
correct, inconclusive

(83, 1)

### French query and Russian Eval

In [85]:
predictions_fr_ru = get_predictions(incontext_fr.format(get_ic_examples(args.l2)), eval_lang=args.l3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:20<00:00,  9.55it/s]


In [86]:
correct, inconclusive = get_correct_pred(predictions_fr_ru, eval_lang=args.l1)

In [87]:
correct, inconclusive

(78, 4)

# Extra Large Model

In [18]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-xxl")
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-xxl")
model.to(args.device)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 4096)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 4096)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=4096, out_features=4096, bias=False)
              (k): Linear(in_features=4096, out_features=4096, bias=False)
              (v): Linear(in_features=4096, out_features=4096, bias=False)
              (o): Linear(in_features=4096, out_features=4096, bias=False)
              (relative_attention_bias): Embedding(32, 64)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=4096, out_features=10240, bias=False)
              (wi_1): Linear(in_features=4096, out_features=10240, bias=Fa

### Zero Shot Setup

### English

In [19]:
predictions_en = get_predictions(prompt_en, eval_lang=args.l1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:30<00:00,  6.63it/s]


In [20]:
correct, inconclusive = get_correct_pred(predictions_en, eval_lang=args.l1)
correct, inconclusive 

(141, 0)

### French

In [21]:
predictions_fr = get_predictions(prompt_fr, eval_lang=args.l2)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:32<00:00,  6.08it/s]


In [22]:
correct, inconclusive = get_correct_pred(predictions_fr, eval_lang=args.l1)
correct, inconclusive 

(123, 5)

### Russian

In [23]:
predictions_ru = get_predictions(prompt_ru, eval_lang=args.l3)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:27<00:00,  7.35it/s]


In [24]:
correct, inconclusive = get_correct_pred(predictions_ru, eval_lang=args.l1)
correct, inconclusive 

(47, 74)

### Incontext setup

### Russian queries and English Eval

In [25]:
predictions_ru_en = get_predictions(incontext_ru.format(get_ic_examples(args.l3)), eval_lang=args.l1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:53<00:00,  3.73it/s]


In [26]:
correct, inconclusive = get_correct_pred(predictions_ru_en, eval_lang=args.l1)

In [27]:
correct, inconclusive

(11, 170)

### Russian queries and French Eval

In [28]:
predictions_ru_fr = get_predictions(incontext_ru.format(get_ic_examples(args.l3)), eval_lang=args.l2)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:52<00:00,  3.84it/s]


In [29]:
correct, inconclusive = get_correct_pred(predictions_ru_fr, eval_lang=args.l1)

In [30]:
correct, inconclusive

(12, 159)

### English query and French Eval

In [31]:
predictions_en_fr = get_predictions(incontext_en.format(get_ic_examples(args.l1)), eval_lang=args.l2)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:51<00:00,  3.85it/s]


In [32]:
correct, inconclusive = get_correct_pred(predictions_en_fr, eval_lang=args.l1)

In [33]:
correct, inconclusive

(121, 0)

### English query and Russian Eval

In [34]:
predictions_en_ru = get_predictions(incontext_en.format(get_ic_examples(args.l1)), eval_lang=args.l3)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:50<00:00,  3.94it/s]


In [35]:
correct, inconclusive = get_correct_pred(predictions_en_ru, eval_lang=args.l1)

In [36]:
correct, inconclusive

(110, 0)

### French query and English Eval

In [37]:
predictions_fr_en = get_predictions(incontext_fr.format(get_ic_examples(args.l2)), eval_lang=args.l1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:58<00:00,  3.39it/s]


In [38]:
correct, inconclusive = get_correct_pred(predictions_fr_en, eval_lang=args.l1)

In [39]:
correct, inconclusive

(94, 56)

### French query and Russian Eval

In [40]:
predictions_fr_ru = get_predictions(incontext_fr.format(get_ic_examples(args.l2)), eval_lang=args.l3)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:00<00:00,  3.28it/s]


In [41]:
correct, inconclusive = get_correct_pred(predictions_fr_ru, eval_lang=args.l1)

In [42]:
correct, inconclusive

(80, 60)