In [None]:
!pip -q install deep_translator
!pip -q install faiss-gpu
!pip -q install ipywidgets
!pip install -q transformers==4.34.0 datasets==2.14.5 accelerate==0.23.0 evaluate==0.4.1 peft==0.5.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset
from deep_translator import GoogleTranslator
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Shared French -> English translator; translate_fren_eng calls it for every
# question and answer option.
translator = GoogleTranslator(source='fr', target='en')

In [None]:
import ast
import torch
import pandas as pd
from tqdm import tqdm

In [None]:
#FrenchMedMCQA
!unzip ./original_dataset/FrenchMedMCQA.zip

Archive:  FrenchMedMCQA.zip
  inflating: FrenchMedMCQA/test.json  
  inflating: FrenchMedMCQA/dev.json  
  inflating: FrenchMedMCQA/train.json  


In [None]:
#Translation
def translate_fren_eng(file_path):
  """Translate one FrenchMedMCQA split with the module-level ``translator``.

  The file is read with ``pd.read_json``. For every row the question and each
  answer option are passed through ``translator.translate``; all other fields
  are copied through unchanged.

  Args:
    file_path: Path (or file-like object) accepted by ``pd.read_json``.

  Returns:
    A ``pd.DataFrame`` with the columns ``id``, ``question``, ``answers``
    (dict mapping choice -> translated text), ``correct_answers``,
    ``subject_name`` and ``nbr_correct_answers``.
  """
  dataset = pd.read_json(file_path)
  en_dataset = []
  # iterrows() is a generator without a length, so hand tqdm the row count
  # explicitly; otherwise the bar cannot show percentage or ETA.
  for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
    question = translator.translate(row["question"])
    new_answers = {
        choice: translator.translate(answer)
        for choice, answer in row["answers"].items()
    }
    en_dataset.append({
        'id': row["id"],
        'question': question,
        'answers': new_answers,
        'correct_answers': row["correct_answers"],
        'subject_name': row["subject_name"],
        'nbr_correct_answers': row["nbr_correct_answers"],
    })
  return pd.DataFrame(en_dataset)

In [None]:
def _translate_and_save(json_path, csv_path):
  """Translate one split, write it to ``csv_path`` without the index, and return it."""
  translated = translate_fren_eng(json_path)
  translated.to_csv(csv_path, index=False)
  return translated

# Splits are handled in the order train, test, dev; each one is written to disk
# before the next translation starts.
en_train_df = _translate_and_save('./FrenchMedMCQA/train.json', './FrenchMedMCQA_en_train.csv')
en_test_df = _translate_and_save('./FrenchMedMCQA/test.json', './FrenchMedMCQA_en_test.csv')
en_devtset_df = _translate_and_save('./FrenchMedMCQA/dev.json', './FrenchMedMCQA_en_devtset.csv')

In [None]:
#Download wiki dataset
# The same directory is used both as the download cache and as the data dir.
WIKI_DPR_CACHE_DIR = "./wiki_dpr/dataset_cache/"
# retrieve_topk later queries an index named 'embeddings' on the 'train' split
# of this dataset, hence with_index=True.
wiki_dataset = load_dataset(
    "wiki_dpr",
    cache_dir=WIKI_DPR_CACHE_DIR,
    data_dir=WIKI_DPR_CACHE_DIR,
    with_index=True,
)

In [None]:
#Context retrieval
# Encoder and tokenizer are both loaded from the same pretrained checkpoint.
DPR_QUESTION_CHECKPOINT = "facebook/dpr-question_encoder-single-nq-base"
q_encoder = DPRQuestionEncoder.from_pretrained(DPR_QUESTION_CHECKPOINT)
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(DPR_QUESTION_CHECKPOINT)

def retrieve_topk(question, k=20, max_length=512):
    """Return the ``k`` nearest passages for ``question`` from ``wiki_dataset['train']``.

    The question is tokenized with ``q_tokenizer``, embedded with ``q_encoder``
    and looked up in the dataset index named ``'embeddings'``.

    Args:
        question: Query text.
        k: Number of passages to retrieve.
        max_length: Token limit applied when tokenizing. The default of 512 is
            the usual input limit of BERT-base encoders such as this DPR
            checkpoint; longer queries are truncated instead of failing inside
            the encoder.

    Returns:
        ``(texts, scores)``: the ``'text'`` field of the retrieved examples and
        the scores reported by ``get_nearest_examples``.
    """
    inputs = q_tokenizer(question, return_tensors="pt", truncation=True, max_length=max_length)
    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        pooled = q_encoder(**inputs)[0]
    # First (only) row of the batch; .cpu() keeps this working if the encoder
    # is ever moved off the CPU.
    question_embedding = pooled[0].detach().cpu().numpy()
    scores, retrieved_examples = wiki_dataset['train'].get_nearest_examples('embeddings', question_embedding, k=k)
    return retrieved_examples['text'], scores

def search_context(df, k=5):
    """Attach retrieved passages to every question in ``df``.

    Each question is lower-cased and sent to ``retrieve_topk``. When the
    question is exactly the generic stem "which of the following is/are
    correct?", the answer option texts are appended to the query, because the
    stem alone gives the retriever nothing to match on.

    Args:
        df: DataFrame with a ``question`` column and an ``answers`` column.
            ``answers`` may hold either a dict of choice -> text or its string
            form (as produced by a CSV round trip).
        k: Number of passages joined into each context (default 5).

    Returns:
        The same ``df`` object, with a new ``context`` column added in place.
        Each context is the retrieved passage texts joined with ``'. '``.
    """
    generic_question = "which of the following is/are correct?"
    contexts = []
    for position, raw_question in enumerate(df['question']):
        query = raw_question.lower()
        if query == generic_question:
            opts = df.iloc[position]['answers']
            # CSV round trips store the dict as text; parse only in that case.
            if isinstance(opts, str):
                opts = ast.literal_eval(opts)
            query = '. '.join([query] + list(opts.values()))
        contexts.append('. '.join(retrieve_topk(query, k=k)[0]))
    df['context'] = contexts
    return df

In [None]:
# Add retrieved context to each translated split: train, then test, then dev.
for source_csv, target_csv in [
    ('./FrenchMedMCQA_en_train.csv', './FrenchMedMCQA_context_en_train.csv'),
    ('./FrenchMedMCQA_en_test.csv', './FrenchMedMCQA_context_en_test.csv'),
    ('./FrenchMedMCQA_en_devtset.csv', './FrenchMedMCQA_context_en_devtset.csv'),
]:
    df = pd.read_csv(source_csv)
    search_context(df).to_csv(target_csv, index = False)

In [None]:
def id_labeling(num_opts):
  """Build the label vocabulary for questions with ``num_opts`` answer options.

  For 3, 4 and 5 options the table lists every non-empty combination of option
  letters, ordered by size and then alphabetically. For 2 options it lists only
  the single letters.
  NOTE(review): the 2-option table has no 'ab' entry, unlike the larger tables —
  confirm whether that is intentional.

  Args:
    num_opts: Number of answer options; 2, 3, 4 and 5 are supported.

  Returns:
    ``(id2label, label2id, num_labels)`` for a supported ``num_opts``;
    ``None`` for any other value.
  """
  labels_by_option_count = {
      2: ['a', 'b'],
      3: ['a', 'b', 'c', 'ab', 'ac', 'bc', 'abc'],
      4: ['a', 'b', 'c', 'd',
          'ab', 'ac', 'ad', 'bc', 'bd', 'cd',
          'abc', 'abd', 'acd', 'bcd',
          'abcd'],
      5: ['a', 'b', 'c', 'd', 'e',
          'ab', 'ac', 'ad', 'ae', 'bc', 'bd', 'be', 'cd', 'ce', 'de',
          'abc', 'abd', 'abe', 'acd', 'ace', 'ade', 'bcd', 'bce', 'bde', 'cde',
          'abcd', 'abce', 'abde', 'acde', 'bcde',
          'abcde'],
  }
  if num_opts not in [2, 3, 4, 5]:
    return None
  labels = labels_by_option_count[num_opts]
  id2label = dict(enumerate(labels))
  label2id = {name: position for position, name in id2label.items()}
  return id2label, label2id, len(id2label)

def labeling(label):
  """Map a question's correct answers to a 1-based class id.

  Args:
    label: The correct option letters, either as the string form of a list
      (e.g. "['a', 'c']", as stored in the CSV files) or as an already-parsed
      list/tuple of letters.

  Returns:
    One plus the index of the sorted, concatenated letters in the five-option
    label table returned by ``id_labeling(5)``.

  Raises:
    KeyError: If the letter combination is not in the five-option table.
  """
  _, label2id, _ = id_labeling(5)
  # Parse once; already-parsed sequences are used as they are.
  parsed = ast.literal_eval(label) if isinstance(label, str) else label
  # Entries that themselves contain commas (e.g. 'a,b') are split into single
  # letters, matching the earlier join-then-split behavior.
  letters = sorted(part for entry in parsed for part in entry.split(','))
  return label2id["".join(letters)] + 1

def data_prepare(filepath):
  """Load a context-augmented CSV and return it in the opa..ope / exp layout.

  The CSV is read with ``pd.read_csv`` and passed through ``split_answers``;
  ``cop`` is then (re)computed from ``correct_answers`` with ``labeling``.
  Finally the columns are selected and renamed: ``answers.a`` .. ``answers.e``
  become ``opa`` .. ``ope`` and ``context`` becomes ``exp``.

  Args:
    filepath: Path (or file-like object) accepted by ``pd.read_csv``.

  Returns:
    A DataFrame with the columns ``id``, ``question``, ``correct_answers``,
    ``cop``, ``opa``, ``opb``, ``opc``, ``opd``, ``ope`` and ``exp``.
  """
  # (source column, output column) in output order.
  column_pairs = [
      ('id', 'id'),
      ('question', 'question'),
      ('correct_answers', 'correct_answers'),
      ('cop', 'cop'),
      ('answers.a', 'opa'),
      ('answers.b', 'opb'),
      ('answers.c', 'opc'),
      ('answers.d', 'opd'),
      ('answers.e', 'ope'),
      ('context', 'exp'),
  ]
  frame = split_answers(pd.read_csv(filepath))
  frame['cop'] = frame['correct_answers'].apply(labeling)
  selected = frame[[source for source, _ in column_pairs]]
  selected.columns = [target for _, target in column_pairs]
  return selected

In [None]:
#Splitting answers
def split_answers(df):
  """Expand the ``answers`` column into one column per option and add ``cop``.

  Args:
    df: DataFrame with the columns ``id``, ``question``, ``answers``,
      ``correct_answers`` and ``context``. ``answers`` holds a dict with the
      keys 'a'..'e', or its string form (as produced by a CSV round trip).

  Returns:
    A new DataFrame with the columns ``id``, ``question``, ``answers.a`` ..
    ``answers.e``, ``context``, ``cop`` and ``correct_answers``. ``cop`` is
    ``labeling`` applied to ``correct_answers``. The input frame is not
    modified.
  """
  parsed_answers = [
      ast.literal_eval(raw) if isinstance(raw, str) else raw
      for raw in df['answers']
  ]
  # Build the option table in one go (no per-row concat) and give it df's own
  # index, so the column-wise concat below lines rows up even when df does not
  # carry a default 0..n-1 index.
  data = pd.DataFrame(parsed_answers, columns=['a', 'b', 'c', 'd', 'e'], index=df.index)
  data.columns = ['answers.a', 'answers.b', 'answers.c', 'answers.d', 'answers.e']
  new_df = pd.concat([df, data], axis=1)
  new_df['cop'] = new_df['correct_answers'].apply(labeling)
  columns = ['id', 'question', 'answers.a', 'answers.b', 'answers.c', 'answers.d', 'answers.e', 'context', 'cop', 'correct_answers']
  return new_df[columns]

In [None]:
import pandas as pd
# Expand the answers of each context-augmented split: dev, then test, then train.
# NOTE(review): these inputs are read from ./FrenchMedMCQA_en_context_dataset/,
# while the context-retrieval cell of this notebook writes its CSVs to ./ —
# confirm the files are placed in that directory before this cell runs.
for source_csv, target_csv in [
    ('./FrenchMedMCQA_en_context_dataset/FrenchMedMCQA_context_en_devtset.csv', './FrenchMedMCQA_modified_devtset.csv'),
    ('./FrenchMedMCQA_en_context_dataset/FrenchMedMCQA_context_en_test.csv', './FrenchMedMCQA_modified_test.csv'),
    ('./FrenchMedMCQA_en_context_dataset/FrenchMedMCQA_context_en_train.csv', './FrenchMedMCQA_modified_train.csv'),
]:
    df = pd.read_csv(source_csv)
    split_answers(df).to_csv(target_csv, index = False)