# Imports

In [1]:
import numpy as np
import pandas as pd

# Get datasets

In [2]:
# # Download data from previous tests from Maritaca website
# # If it's the first time running, this will create the folder for you

# # Download data from 2022 into folder datasets
# !wget -cP ./datasets https://huggingface.co/datasets/maritaca-ai/enem/resolve/main/2022.jsonl
# # Download data from 2023 into folder datasets
# !wget -cP ./datasets https://huggingface.co/datasets/maritaca-ai/enem/resolve/main/2023.jsonl

In [3]:
# # Download microdata from Enem website 2023 and unzip
# !wget -cP ./datasets https://download.inep.gov.br/microdados/microdados_enem_2023.zip
# !unzip datasets/microdados_enem_2023.zip -d ./datasets/2023

# # Download microdata from Enem website 2022 and unzip
# !wget -cP ./datasets https://download.inep.gov.br/microdados/microdados_enem_2022.zip
# !unzip datasets/microdados_enem_2022.zip -d ./datasets/2022

# Clean dataset

## Load questions

In [4]:
# Read the 2022 and 2023 question files (JSON Lines, one record per line)
# and stack them row-wise into a single dataframe
df_questions_2022, df_questions_2023 = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in ("./datasets/2022.jsonl", "./datasets/2023.jsonl")
)
df_questions = pd.concat((df_questions_2022, df_questions_2023))
df_questions.head()

Unnamed: 0,id,exam,IU,ledor,question,description,alternatives,label,figures
0,questao_01,2022,True,True,"[[placeholder]]\nNessa tirinha, o comportament...",[Descrição da imagem: Tirinha apresentada em q...,"[revolta com a falta de sorte., gosto pela prá...",B,[https://raw.githubusercontent.com/piresramon/...
1,questao_02,2022,False,True,## A Teen’s View of Social Media\nInstagram is...,[],"[oferecer recursos de fotografia., divulgar pr...",D,[]
2,questao_03,2022,False,True,I tend the mobile now like an injured bird\nWe...,[],"[contentamento com a interação virtual., zelo ...",E,[]
3,questao_04,2022,False,True,"Two hundred years ago, Jane Austen lived in a ...",[],[problematizar o papel de gênero em casamentos...,C,[]
4,questao_05,2022,False,True,"As my official bio reads, I was made in Cuba, ...",[],"[qualidade da educação formal em Miami., prest...",D,[]


In [5]:
# Inspect the dtype pandas assigned to each column of the questions dataframe
df_questions.dtypes

id              object
exam             int64
IU                bool
ledor             bool
question        object
description     object
alternatives    object
label           object
figures         object
dtype: object

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max); with the default
# arguments only the numeric column `exam` shows up in the result
df_questions.describe()


Unnamed: 0,exam
count,360.0
mean,2022.5
std,0.500696
min,2022.0
25%,2022.0
50%,2022.5
75%,2023.0
max,2023.0


In [7]:
# Create a boolean column to indicate if it has figures or not.
# `figures` holds a list per row, so `.str.len()` gives the list length and the
# comparison is already a boolean Series -- no np.where(..., True, False) needed.
df_questions['has_figures'] = df_questions['figures'].str.len() > 0

In [8]:
# Drop unnecessary columns. `DataFrame.drop` returns a new frame, so the result
# is assigned back; otherwise the columns would only be hidden in this cell's
# output and still be present in `df_questions` afterwards.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
df_questions = df_questions.drop(columns=['IU', 'ledor', 'figures'], errors='ignore')
df_questions

Unnamed: 0,id,exam,question,description,alternatives,label,has_figures
0,questao_01,2022,"[[placeholder]]\nNessa tirinha, o comportament...",[Descrição da imagem: Tirinha apresentada em q...,"[revolta com a falta de sorte., gosto pela prá...",B,True
1,questao_02,2022,## A Teen’s View of Social Media\nInstagram is...,[],"[oferecer recursos de fotografia., divulgar pr...",D,False
2,questao_03,2022,I tend the mobile now like an injured bird\nWe...,[],"[contentamento com a interação virtual., zelo ...",E,False
3,questao_04,2022,"Two hundred years ago, Jane Austen lived in a ...",[],[problematizar o papel de gênero em casamentos...,C,False
4,questao_05,2022,"As my official bio reads, I was made in Cuba, ...",[],"[qualidade da educação formal em Miami., prest...",D,False
...,...,...,...,...,...,...,...
175,questao_176,2023,O mastro de uma bandeira foi instalado perpend...,[Descrição da figura: A figura representa o ma...,"[22 x sqrt(3)/3., 11 x sqrt(2)., 12 x sqrt(2)....",C,True
176,questao_177,2023,Um controlador de voo dispõe de um instrumento...,"[Descrição do gráfico: Gráfico cartesiano, em ...","[1 : 5., 1 : 11., 1 : 55., 1 : 5 000., 1 : 500...",E,True
177,questao_178,2023,O calendário maia apresenta duas contagens sim...,[],"[741, 1 040, 1 460, 2 100, 5 200]",C,False
178,questao_179,2023,"Sejam a, b e c as medidas dos lados de um triâ...",[Descrição da figura: A figura apresenta um tr...,"[0 grau < alfa < 90 graus, alfa = 90 graus., 9...",C,True


## Load Enem Microdata

In [9]:
def load_huge_csv(filename, chunksize=1000, encoding="ISO-8859-1", sep=";"):
    """Read a large delimited text file chunk by chunk into one DataFrame.

    The file is parsed ``chunksize`` rows at a time and the chunks are
    concatenated in file order. Note that the returned frame still holds the
    whole file in memory; chunking only bounds how much the parser handles
    at once.

    Parameters
    ----------
    filename : str or path-like
        Location of the file, forwarded to ``pd.read_csv``.
    chunksize : int, default 1000
        Number of rows parsed per chunk.
    encoding : str, default "ISO-8859-1"
        Text encoding used to decode the file.
    sep : str, default ";"
        Field delimiter.

    Returns
    -------
    pandas.DataFrame
        All rows of the file, in their original order.
    """
    # The chunked reader is a context manager: using `with` guarantees the
    # underlying file handle is closed even if parsing a chunk raises.
    with pd.read_csv(
        filename, chunksize=chunksize, encoding=encoding, sep=sep
    ) as reader:
        return pd.concat(list(reader))

In [None]:
# Load each year's microdata from its own extracted folder (the paths were
# previously swapped, so df_micro_2022 held the 2023 file and vice versa),
# then stack both years row-wise.
df_micro_2022 = load_huge_csv("./datasets/2022/DADOS/MICRODADOS_ENEM_2022.csv")
df_micro_2023 = load_huge_csv("./datasets/2023/DADOS/MICRODADOS_ENEM_2023.csv")
df_micro = pd.concat([df_micro_2022, df_micro_2023], axis=0)
df_micro.head()