In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


Load and remove cancelled questions

In [52]:
enem = {}
enem["2022"] = load_dataset("maritaca-ai/enem", "2022", split="train").filter(lambda x: x["label"].lower() != "anulado")
enem["2023"] = load_dataset("maritaca-ai/enem", "2023", split="train").filter(lambda x: x["label"].lower() != "anulado")
enem["2024"] = load_dataset("maritaca-ai/enem", "2024", split="train").filter(lambda x: x["label"].lower() != "anulado")

Add reasoning column to train dataset - 2024 and 2023 exams

In [53]:
reason2022 = pd.read_csv('enem_reason_2022.csv')['reasoning'].to_list()
reason2023 = pd.read_csv('enem_reason_2023.csv')['reasoning'].to_list()
enem["2022"] = enem["2022"].add_column("reasoning", reason2022)
enem["2023"] = enem["2023"].add_column("reasoning", reason2023)

In [54]:
def enem_chat_template(x, source="exam", metadata_col=None, reason_col=None, label_col="label"):
	label_to_id = {
		"A": 0,
		"B": 1,
		"C": 2,
		"D": 3,
  		"E": 4
	}
	id_to_label = {v: k for k, v in label_to_id.items()}
	alternatives = [f"\n{id_to_label[i]} - {a}" for i,a in enumerate(x["alternatives"])]
	human = x["question"] + "\n" + "\n".join(x["description"]) + "\n\n"  + "Alternativas:" + "".join(alternatives) 
	if reason_col:
		reasoning = x[reason_col]
		assistant = f"{x['label']} - {x['alternatives'][label_to_id[x['label']]]}\n\nRaciocínio: {reasoning}"
	else:
		assistant = f"{x["label"]} - {x["alternatives"][label_to_id[x["label"]]]}" 
	
	return {"conversations": [{"role":"human", "content": human},
							  {"role":"assistant", "content": assistant}
							],
			"source": year,
			"metadata": x[metadata_col] if metadata_col else source,
			"label_raw": x[label_col]
			}

In [55]:
year_trains = ['2022', '2023']
train_ds = []
train_ds_reason = []
for year in year_trains:
	train_ds.append(enem[year].map(enem_chat_template, remove_columns=enem[year].column_names))
	train_ds_reason.append(enem[year].map(lambda x: enem_chat_template(x, reason_col="reasoning"), remove_columns=enem[year].column_names))

Map: 100%|██████████| 179/179 [00:00<00:00, 8492.61 examples/s]
Map: 100%|██████████| 179/179 [00:00<00:00, 7754.71 examples/s]
Map: 100%|██████████| 179/179 [00:00<00:00, 9139.14 examples/s]
Map: 100%|██████████| 179/179 [00:00<00:00, 8040.74 examples/s]


In [56]:
enem["2024"] = enem["2024"].map(enem_chat_template, remove_columns=enem["2024"].column_names)
enem["2024"]

Map: 100%|██████████| 179/179 [00:00<00:00, 6961.02 examples/s]


Dataset({
    features: ['conversations', 'source', 'metadata', 'label_raw'],
    num_rows: 179
})

In [57]:
enem["2024"][0]

{'conversations': [{'content': '## Holy War\nOh, so we can hate each other and fear each other\nWe can build these walls between each other Baby, blow by blow and brick by brick Keep yourself locked in, yourself locked in\n[…]\nOh, maybe we should love somebody\nOh, maybe we could care a little more\nSo maybe we should love somebody\nInstead of polishing the bombs of holy war\nNessa letra de canção, de Alicia Keys, que aborda um contexto de ódio e intolerância, o marcador “instead of ” introduz a ideia de\n\n\nAlternativas:\nA - mudança de comportamento.\nB - panorama de conflitos.\nC - rotina de isolamento.\nD - perspectiva bélica.\nE - cenário religioso.',
   'role': 'human'},
  {'content': 'A - mudança de comportamento.', 'role': 'assistant'}],
 'source': '2023',
 'metadata': 'exam',
 'label_raw': 'A'}

In [None]:
from datasets import concatenate_datasets

concatenate_datasets(train_ds).save_to_disk("data/enem_train")
concatenate_datasets(train_ds_reason).save_to_disk("data/enem_train_reason")
enem["2024"].save_to_disk("data/enem_2024")


Saving the dataset (1/1 shards): 100%|██████████| 358/358 [00:00<00:00, 90129.70 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 358/358 [00:00<00:00, 84480.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 179/179 [00:00<00:00, 52812.35 examples/s]
