In [17]:
import json
import pandas as pd

def data_prepare(filepath):
  """Load a JSON-lines question file and return it in the shared MCQA layout.

  The file is parsed with ``pd.read_json(filepath, lines=True)``. An ``ope``
  column holding empty strings is added to act as a fifth answer option, then
  nine columns are selected and renamed to the ``answers.*`` / ``context``
  names used by the rest of the notebook.

  Args:
    filepath: Path to a JSON-lines file whose records provide the keys
      ``id``, ``question``, ``cop``, ``opa``, ``opb``, ``opc``, ``opd`` and
      ``exp``. Any other keys are dropped.

  Returns:
    A new DataFrame with the columns ``id``, ``question``, ``cop``,
    ``answers.a`` .. ``answers.e`` and ``context``, in that order.

  Raises:
    KeyError: If one of the required source columns is missing.
  """
  # Source column -> output column, listed in the output order.
  column_map = {
      'id': 'id',
      'question': 'question',
      'cop': 'cop',
      'opa': 'answers.a',
      'opb': 'answers.b',
      'opc': 'answers.c',
      'opd': 'answers.d',
      'ope': 'answers.e',
      'exp': 'context',
  }
  records = pd.read_json(filepath, lines=True)
  # Empty placeholder so the frame always carries an ``answers.e`` column.
  records['ope'] = ''
  return records[list(column_map)].rename(columns=column_map)

def generate_examples(filepath):
  """Yield ``(row_index, example)`` pairs built from a prepared MCQA CSV file.

  The CSV is read with ``pd.read_csv`` and must contain the columns ``id``,
  ``question``, ``cop``, ``answers.a`` .. ``answers.e`` and ``context``.
  For every row a BERT-style input string is assembled as::

      [CLS] <question> [SEP] <opt 1> [SEP] ... [SEP] <opt n> [SEP]<context>

  Missing options are skipped, and a missing context is replaced by the
  literal string ``"None"``. The context follows the last ``[SEP]`` without a
  separating space; that format is kept unchanged.

  Option values are converted with ``str()`` before being joined, matching
  the ``answer_*`` fields. Without this, an option column that ``read_csv``
  parses as numeric makes ``str.join`` raise ``TypeError``.

  Args:
    filepath: Path to the CSV file to read.

  Yields:
    Tuples ``(key, example)`` where ``key`` is the DataFrame index label of
    the row and ``example`` is a dict with the keys ``id``, ``question``,
    ``answer_a`` .. ``answer_e`` (always strings, ``"nan"`` when missing),
    ``label`` (``cop - 1``), ``context`` and ``bert_text``.
  """
  BERT_CLS = "[CLS]"
  BERT_SEP = "[SEP]"
  BERT_EOS = ""
  option_columns = ['answers.a', 'answers.b', 'answers.c', 'answers.d', 'answers.e']
  df = pd.read_csv(filepath, low_memory=False)
  for key, d in df.iterrows():
    # Use a local value rather than writing into the row Series being iterated.
    context = "None" if pd.isna(d["context"]) else d["context"]
    # Drop missing options and stringify the rest so numeric options can be joined.
    opt_lst = [str(op) for op in d[option_columns].tolist() if not pd.isna(op)]
    bert_ctx = (
        BERT_CLS + " " + d["question"]
        + f" {BERT_SEP} " + f" {BERT_SEP} ".join(opt_lst)
        + " " + BERT_SEP + str(context) + BERT_EOS
    )
    yield key, {"id": d["id"],
                "question": d["question"],
                "answer_a": str(d["answers.a"]),
                "answer_b": str(d["answers.b"]),
                "answer_c": str(d["answers.c"]),
                "answer_d": str(d["answers.d"]),
                "answer_e": str(d["answers.e"]),
                # ``cop`` is 1-based in the CSV; the label is 0-based.
                "label": d['cop']-1,
                "context": context,
                "bert_text": bert_ctx,
                }

In [None]:
#FrenchMCQA
!unzip ./modified_dataset/FrenchMedMCQA_modified_dataset.zip
#MedMCQA
!unzip ./original_dataset/MedMCQA.zip

In [18]:
# Training set
# Combine MedMCQA train with the FrenchMedMCQA train and test splits, write the
# merged table to CSV, then export one JSON object per line.
# NOTE(review): the FrenchMedMCQA *test* split is merged into the training
# data here — confirm this is intended and that it is not used for evaluation.
mcqa_columns = ['id', 'question', 'cop', 'answers.a', 'answers.b', 'answers.c', 'answers.d', 'answers.e', 'context']

MedMCQA_train = data_prepare('./train.json')
FrenchMCQA_train = pd.read_csv('./FrenchMedMCQA_modified_dataset/FrenchMedMCQA_modified_train.csv')
FrenchMCQA_test = pd.read_csv('./FrenchMedMCQA_modified_dataset/FrenchMedMCQA_modified_test.csv')

# Keep only the columns shared with the MedMCQA frame.
French_train = FrenchMCQA_train[mcqa_columns]
French_test = FrenchMCQA_test[mcqa_columns]
FrenchMCQA_data = pd.concat([French_train, French_test], axis=0, ignore_index=True)

train_frame = pd.concat([MedMCQA_train, FrenchMCQA_data], axis=0, ignore_index=True)
train_frame.to_csv('MCQA_train_set.csv', index=False)

# generate_examples yields (key, example); only the example dict is written.
with open("./MCQA_train.json", "w") as f:
  for _, example in generate_examples('./MCQA_train_set.csv'):
    f.write(json.dumps(example))
    f.write('\n')

In [19]:
# Validation set
# Combine MedMCQA dev with the FrenchMedMCQA dev split, write the merged table
# to CSV, then export one JSON object per line.
dev_columns = ['id', 'question', 'cop', 'answers.a', 'answers.b', 'answers.c', 'answers.d', 'answers.e', 'context']

MedMCQA_dev = data_prepare('./dev.json')
# NOTE(review): "devtset" in the file name is kept as-is; it has to match the
# name of the file inside the extracted archive — confirm the spelling.
FrenchMCQA_dev = pd.read_csv('./FrenchMedMCQA_modified_dataset/FrenchMedMCQA_modified_devtset.csv')

dev_frames = [MedMCQA_dev, FrenchMCQA_dev[dev_columns]]
pd.concat(dev_frames, axis=0, ignore_index=True).to_csv('MCQA_dev_set.csv', index=False)

# generate_examples yields (key, example); only the example dict is written.
with open("./MCQA_dev.json", "w") as f:
  for _, example in generate_examples('./MCQA_dev_set.csv'):
    f.write(json.dumps(example))
    f.write('\n')