# Imports

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Set limite of visible columns to None
pd.set_option('display.max_columns', None)

# Get datasets

In [3]:
# # Download data from previous tests from Maritaca website
# # If its the first time running, this will create the folder for you

# # Download data from 2022 into folder datasets
# !wget -cP ./datasets https://huggingface.co/datasets/maritaca-ai/enem/resolve/main/2022.jsonl
# # Download data from 2023 into folder datasets
# !wget -cP ./datasets https://huggingface.co/datasets/maritaca-ai/enem/resolve/main/2023.jsonl

In [4]:
# # Download microdata from Enem website 2023 and unzip
# !wget -cP ./datasets https://download.inep.gov.br/microdados/microdados_enem_2023.zip
# !unzip datasets/microdados_enem_2023.zip -d ./datasets/2023

# # Download microdata from Enem website 2022 and unzip
# !wget -cP ./datasetshttps://download.inep.gov.br/microdados/microdados_enem_2022.zip
# !unzip datasets/microdados_enem_2022.zip -d ./datasets/2022

# Clean dataset

## Load questions

In [5]:
# Load data from 2022 and 2023 then concat into a unique dataframe
df_questions_2022 = pd.read_json("./datasets/2022.jsonl", lines=True)
df_questions_2023 = pd.read_json("./datasets/2023.jsonl", lines=True)
df_questions = pd.concat([df_questions_2022, df_questions_2023], axis=0)
df_questions.head()

Unnamed: 0,id,exam,IU,ledor,question,description,alternatives,label,figures
0,questao_01,2022,True,True,"[[placeholder]]\nNessa tirinha, o comportament...",[Descrição da imagem: Tirinha apresentada em q...,"[revolta com a falta de sorte., gosto pela prá...",B,[https://raw.githubusercontent.com/piresramon/...
1,questao_02,2022,False,True,## A Teen’s View of Social Media\nInstagram is...,[],"[oferecer recursos de fotografia., divulgar pr...",D,[]
2,questao_03,2022,False,True,I tend the mobile now like an injured bird\nWe...,[],"[contentamento com a interação virtual., zelo ...",E,[]
3,questao_04,2022,False,True,"Two hundred years ago, Jane Austen lived in a ...",[],[problematizar o papel de gênero em casamentos...,C,[]
4,questao_05,2022,False,True,"As my official bio reads, I was made in Cuba, ...",[],"[qualidade da educação formal em Miami., prest...",D,[]


In [6]:
# Get all data types
df_questions.dtypes

id              object
exam             int64
IU                bool
ledor             bool
question        object
description     object
alternatives    object
label           object
figures         object
dtype: object

In [7]:
# Get statistical data from numeric values
df_questions.describe()


Unnamed: 0,exam
count,360.0
mean,2022.5
std,0.500696
min,2022.0
25%,2022.0
50%,2022.5
75%,2023.0
max,2023.0


In [8]:
# Create a boolean column to indicate if it has figures or not
df_questions['has_figures'] = np.where( df_questions['figures'].str.len() > 0, True, False)

In [9]:
# Drop unecessary columns
df_questions.drop(['IU', 'ledor', 'figures'], axis=1)

Unnamed: 0,id,exam,question,description,alternatives,label,has_figures
0,questao_01,2022,"[[placeholder]]\nNessa tirinha, o comportament...",[Descrição da imagem: Tirinha apresentada em q...,"[revolta com a falta de sorte., gosto pela prá...",B,True
1,questao_02,2022,## A Teen’s View of Social Media\nInstagram is...,[],"[oferecer recursos de fotografia., divulgar pr...",D,False
2,questao_03,2022,I tend the mobile now like an injured bird\nWe...,[],"[contentamento com a interação virtual., zelo ...",E,False
3,questao_04,2022,"Two hundred years ago, Jane Austen lived in a ...",[],[problematizar o papel de gênero em casamentos...,C,False
4,questao_05,2022,"As my official bio reads, I was made in Cuba, ...",[],"[qualidade da educação formal em Miami., prest...",D,False
...,...,...,...,...,...,...,...
175,questao_176,2023,O mastro de uma bandeira foi instalado perpend...,[Descrição da figura: A figura representa o ma...,"[22 x sqrt(3)/3., 11 x sqrt(2)., 12 x sqrt(2)....",C,True
176,questao_177,2023,Um controlador de voo dispõe de um instrumento...,"[Descrição do gráfico: Gráfico cartesiano, em ...","[1 : 5., 1 : 11., 1 : 55., 1 : 5 000., 1 : 500...",E,True
177,questao_178,2023,O calendário maia apresenta duas contagens sim...,[],"[741, 1 040, 1 460, 2 100, 5 200]",C,False
178,questao_179,2023,"Sejam a, b e c as medidas dos lados de um triâ...",[Descrição da figura: A figura apresenta um tr...,"[0 grau < alfa < 90 graus, alfa = 90 graus., 9...",C,True


## Load Enem Microdata

In [10]:
def load_huge_csv(filename):
    # the number of row in each data frame
    # you can put any value here according to your situation
    chunksize = 1000

    # the list that contains all the dataframes
    list_of_dataframes = []    

    for df in pd.read_csv(filename, chunksize=chunksize, encoding="ISO-8859-1", sep=";"):
        # process your data frame here
        # then add the current data frame into the list
        list_of_dataframes.append(df)

    # if you want all the dataframes together, here it is
    return pd.concat(list_of_dataframes)

In [11]:
df_micro_2022 = load_huge_csv("./datasets/2022/DADOS/MICRODADOS_ENEM_2022.csv")
# df_micro_2023 = load_huge_csv("./datasets/2023/DADOS/MICRODADOS_ENEM_2023.csv")
# df_micro = pd.concat([df_micro_2022, df_micro_2023], axis=0)
# df_micro.head()

In [15]:
df_micro_2022[(df_micro_2022['CO_PROVA_CN'] == 1087) & 
              (df_micro_2022['CO_PROVA_CH'] == 1057) & 
              (df_micro_2022['CO_PROVA_LC'] == 1068) & 
              (df_micro_2022['CO_PROVA_MT'] == 1078)].head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,IN_TREINEIRO,CO_MUNICIPIO_ESC,NO_MUNICIPIO_ESC,CO_UF_ESC,SG_UF_ESC,TP_DEPENDENCIA_ADM_ESC,TP_LOCALIZACAO_ESC,TP_SIT_FUNC_ESC,CO_MUNICIPIO_PROVA,NO_MUNICIPIO_PROVA,CO_UF_PROVA,SG_UF_PROVA,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_PRESENCA_MT,CO_PROVA_CN,CO_PROVA_CH,CO_PROVA_LC,CO_PROVA_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,TX_RESPOSTAS_CN,TX_RESPOSTAS_CH,TX_RESPOSTAS_LC,TX_RESPOSTAS_MT,TP_LINGUA,TX_GABARITO_CN,TX_GABARITO_CH,TX_GABARITO_LC,TX_GABARITO_MT,TP_STATUS_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,Q001,Q002,Q003,Q004,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
303,210054616419,2022,4,F,1,2,1,1,1,1,,0,,,,,,,,1501402,Belém,15,PA,1,1,1,1,1087.0,1057.0,1068.0,1078.0,474.0,503.8,529.0,521.9,DAAAABAEEAABDCCBCAEBCDDCDEBEBEADBADBEECCAACAA,BABCEBEBDDEEDEDDABAEBADDEACCEAEBCBDAAAAAECCAC,AACAECCCDADAEDCBDABBBEDEBAEABCCBEADBAADDCECAE,DEEAECABABCCECCCCADDEEABCBCBDAABBABDABDEDBBCC,1,DDECDBEACCAEBEAEBBCCDDCBDDACBEACEABCEABEDADBA,CBECABBCDEACDCEDAAADBCDDAECBABAECBEBAEEDADEAB,BDECDAADCECCCDCCEBACBBAAAEECABABEECEBEEDCADBDB...,BEEDAEABDDCEBDBAAAAACXCBCCCBCCDBDEECBDCABEECD,1.0,100.0,200.0,120.0,160.0,180.0,760.0,E,C,A,B,5,C,A,B,C,A,B,B,A,B,A,A,A,A,B,B,A,C,A,A,B
1202,210054729170,2022,8,F,1,1,1,1,1,1,,0,,,,,,,,1500800,Ananindeua,15,PA,1,1,1,1,1087.0,1057.0,1068.0,1078.0,375.9,509.7,305.0,376.4,ABBEACDBEBBACBBCECBEDDBACEACCCDBBDDBEEBADEBCE,DEECBEACDDDDECEDACEEEAADBBCAABCEADDDAAEDDCAAC,CCCCCBABBCCBEDDDECCCABACBDAABAEAACDBEEEACADAC,DEBCDEBCADCDEDDBDDEDDDBABCCBDADDEDDECBEDBCADB,1,DDECDBEACCAEBEAEBBCCDDCBDDACBEACEABCEABEDADBA,CBECABBCDEACDCEDAAADBCDDAECBABAECBEBAEEDADEAB,BDECDAADCECCCDCCEBACBBAAAEECABABEECEBEEDCADBDB...,BEEDAEABDDCEBDBAAAAACXCBCCCBCCDBDEECBDCABEECD,1.0,80.0,120.0,100.0,80.0,20.0,400.0,E,E,F,F,5,B,B,B,C,A,A,B,A,B,B,A,A,A,C,A,A,D,A,B,B
2130,210056828156,2022,8,M,1,1,1,1,5,1,,0,,,,,,,,2806701,São Cristóvão,28,SE,1,1,1,1,1087.0,1057.0,1068.0,1078.0,508.7,628.3,594.5,568.0,DCACEBEBBAEADAEBBEEAAEECDCAECDBDDACDDADBDADCD,ACECECECDCACDCEDEAADBCDDBECECBAEBCDEAEEDAAEAB,AACCEBCCDCDCBBCBBABAEEAABABDECEBEAAACDBDDDAAD,BDEBEDDCDBBDDDDCCABCDEDACBDBCBBDBDECBBCCDEBDD,1,DDECDBEACCAEBEAEBBCCDDCBDDACBEACEABCEABEDADBA,CBECABBCDEACDCEDAAADBCDDAECBABAECBEBAEEDADEAB,BDECDAADCECCCDCCEBACBBAAAEECABABEECEBEEDCADBDB...,BEEDAEABDDCEBDBAAAAACXCBCCCBCCDBDEECBDCABEECD,1.0,160.0,200.0,140.0,200.0,200.0,900.0,C,D,B,A,4,C,A,B,C,A,A,B,A,A,A,A,A,A,A,A,A,B,A,A,A
3809,210056712151,2022,15,F,1,1,1,1,0,1,,0,,,,,,,,2910727,Eunápolis,29,BA,1,1,1,1,1087.0,1057.0,1068.0,1078.0,448.5,490.3,450.3,552.2,AADBCAAAADACEBEAAAECAEBDBDEAEEBEEBEEBBEDDEDED,DBDCEDAEADDEBBEADAEEBAEEEBDCBBDDBBDEB.DDADEAA,AACCEBBDBACCCACEAECCAEBBBAAACBEEECCBACBCEEAAB,BEEAECCDBCCECDAAAAEAADCBEABBEBCAAACBBDCBA*BAA,1,DDECDBEACCAEBEAEBBCCDDCBDDACBEACEABCEABEDADBA,CBECABBCDEACDCEDAAADBCDDAECBABAECBEBAEEDADEAB,BDECDAADCECCCDCCEBACBBAAAEECABABEECEBEEDCADBDB...,BEEDAEABDDCEBDBAAAAACXCBCCCBCCDBDEECBDCABEECD,1.0,80.0,120.0,120.0,120.0,100.0,540.0,B,A,A,A,2,B,A,B,B,A,A,B,A,A,A,A,A,A,B,A,A,C,A,A,B
4036,210055864590,2022,12,M,1,1,1,1,16,1,,0,,,,,,,,3550308,São Paulo,35,SP,1,1,1,1,1087.0,1057.0,1068.0,1078.0,478.0,412.6,457.0,429.9,ACEDAAACBDACECCDDBDABCADBDAEBDACEBDACEBDACBEB,BBCEDACBDCBADCEDEADCBECADCABEABCBEABEBADAEBEB,BBDCACCBDEADEECBBCBADBABDBCAABCEBBABCEDABCBAC,DACEBDABCDEBEDDCAEBEEDACEACAECBDABCDEACEBDECD,0,DDECDBEACCAEBEAEBBCCDDCBDDACBEACEABCEABEDADBA,CBECABBCDEACDCEDAAADBCDDAECBABAECBEBAEEDADEAB,BDECDAADCECCCDCCEBACBBAAAEECABABEECEBEEDCADBDB...,BEEDAEABDDCEBDBAAAAACXCBCCCBCCDBDEECBDCABEECD,1.0,100.0,40.0,40.0,120.0,40.0,340.0,B,E,B,B,4,D,A,B,B,A,A,B,A,B,A,B,A,A,B,B,A,D,A,B,B


# Data treatment tests

In [51]:
df_second = pd.read_csv('./temp_data/enem_responses_dataset.csv', encoding="ISO-8859-1", sep=",")

In [54]:
df = df_second[['NU_INSCRICAO','TX_RESPOSTAS_CN']]
df_answers = df['TX_RESPOSTAS_CN'].str.split('', n=45, expand=True)
df_answers = df_answers.iloc[: , 1:]
df = pd.concat([df['NU_INSCRICAO'], df_answers], axis=1)
df.rename(columns={'NU_INSCRICAO':"question_number"}, inplace=True)
df.to_csv('enem_responses_CN_dataset.csv', index=False)

In [55]:
df = df_second[['NU_INSCRICAO','TX_RESPOSTAS_CH']]
df_answers = df['TX_RESPOSTAS_CH'].str.split('', n=45, expand=True)
df_answers = df_answers.iloc[: , 1:]
df = pd.concat([df['NU_INSCRICAO'], df_answers], axis=1)
df.rename(columns={'NU_INSCRICAO':"question_number"}, inplace=True)
df.to_csv('enem_responses_CH_dataset.csv', index=False)

In [56]:
df = df_second[['NU_INSCRICAO','TX_RESPOSTAS_LC']]
df_answers = df['TX_RESPOSTAS_LC'].str.split('', n=45, expand=True)
df_answers = df_answers.iloc[: , 1:]
df = pd.concat([df['NU_INSCRICAO'], df_answers], axis=1)
df.rename(columns={'NU_INSCRICAO':"question_number"}, inplace=True)
df.to_csv('enem_responses_LC_dataset.csv', index=False)

In [53]:
df = df_second[['NU_INSCRICAO','TX_RESPOSTAS_MT']]
df_answers = df['TX_RESPOSTAS_MT'].str.split('', n=45, expand=True)
df_answers = df_answers.iloc[: , 1:]
df = pd.concat([df['NU_INSCRICAO'], df_answers], axis=1)
df.rename(columns={'NU_INSCRICAO':"question_number"}, inplace=True)
df.to_csv('enem_responses_MT_dataset.csv', index=False)