# Prevendo a nota de Matemática do ENEM 2016

___

In [23]:
# Importando as bibliotecas necessárias
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Carregando os datasets
treino = pd.read_csv('train.csv')
teste = pd.read_csv('test.csv')

# Selecionando algumas colunas para retirar
retira_colunas = ['NU_INSCRICAO',
                  'CO_PROVA_CN',
                  'CO_PROVA_CH',
                  'CO_PROVA_LC',
                  'CO_PROVA_MT'
                 ]
inscricao_treino = treino[['NU_INSCRICAO']]
inscricao_teste = teste[['NU_INSCRICAO']]
treino.drop(retira_colunas, axis=1, inplace=True)
teste.drop(retira_colunas, axis=1, inplace=True)

# Mostrando o datasets depois da retirada das colunas
treino

Unnamed: 0.1,Unnamed: 0,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,...,Q041,Q042,Q043,Q044,Q045,Q046,Q047,Q048,Q049,Q050
0,1,2016,4314902,Porto Alegre,43,RS,24,M,0.0,1,...,5.0,A,A,A,A,A,A,A,B,D
1,2,2016,2304707,Granja,23,CE,17,F,0.0,3,...,,A,A,C,A,B,A,A,C,A
2,3,2016,2304400,Fortaleza,23,CE,21,F,0.0,3,...,,A,A,A,A,C,A,A,B,A
3,4,2016,3304557,Rio de Janeiro,33,RJ,25,F,0.0,0,...,5.0,C,A,A,A,A,D,A,A,A
4,5,2016,1302603,Manaus,13,AM,28,M,0.0,2,...,,A,A,A,A,A,A,A,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,4582,2016,3502903,Araçoiaba da Serra,35,SP,20,M,0.0,1,...,,A,A,A,A,A,A,A,A,A
13726,4583,2016,2613800,São Vicente Ferrer,26,PE,33,F,0.0,3,...,,A,A,B,B,A,A,A,A,A
13727,4584,2016,4322186,Tupanci do Sul,43,RS,15,F,0.0,1,...,,A,A,A,A,C,A,A,A,A
13728,4585,2016,3303203,Nilópolis,33,RJ,36,M,1.0,3,...,5.0,A,A,A,A,D,A,A,A,D


In [4]:
presencas = ['TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC']
presencas

['TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC']

In [5]:
# Selecionando as variáveis numéricas do dataset de teste
numeric_features = teste.select_dtypes(include="number").columns.to_list()

# Selecioando as variáveis categóricas do dataset de teste
categoric_feature = [coluna for coluna in teste.columns if coluna not in numeric_features]

['SG_UF_RESIDENCIA',
 'TP_SEXO',
 'Q001',
 'Q002',
 'Q006',
 'Q024',
 'Q025',
 'Q026',
 'Q027',
 'Q047']

In [6]:
# Juntando todas as variáveis escolhidas
all_features = numeric_features + categoric_feature + ['NU_NOTA_MT']

# Aplicando as colunas escolhidas no dataset de treino
treino = treino[all_features]
treino

Unnamed: 0,CO_UF_RESIDENCIA,NU_IDADE,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,IN_TREINEIRO,TP_DEPENDENCIA_ADM_ESC,...,TP_SEXO,Q001,Q002,Q006,Q024,Q025,Q026,Q027,Q047,NU_NOTA_MT
0,43,24,1,1,1,4,1,,0,,...,M,D,D,C,A,A,C,H,A,399.4
1,23,17,3,1,2,0,2,1.0,0,2.0,...,F,A,A,B,A,A,A,,A,459.8
2,23,21,3,1,3,0,1,,0,,...,F,D,D,C,A,A,A,,A,
3,33,25,0,1,1,9,1,,0,,...,F,H,E,E,C,B,C,F,D,
4,13,28,2,1,1,4,1,,0,,...,M,E,D,C,A,A,B,F,A,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,35,20,1,1,1,3,1,,0,,...,M,E,E,G,B,B,B,C,A,403.2
13726,26,33,3,2,1,10,1,,0,,...,F,A,A,C,A,B,B,I,A,452.4
13727,43,15,1,1,3,0,1,,1,,...,F,C,G,B,A,B,A,,A,398.0
13728,33,36,3,1,4,0,1,,0,,...,M,B,D,G,B,B,C,E,A,386.6


In [7]:
# Vendo a quantidade de NaN's nos dados de treino
treino.isna().sum()#.sum()

CO_UF_RESIDENCIA             0
NU_IDADE                     0
TP_COR_RACA                  0
TP_NACIONALIDADE             0
TP_ST_CONCLUSAO              0
TP_ANO_CONCLUIU              0
TP_ESCOLA                    0
TP_ENSINO                 9448
IN_TREINEIRO                 0
TP_DEPENDENCIA_ADM_ESC    9448
IN_BAIXA_VISAO               0
IN_CEGUEIRA                  0
IN_SURDEZ                    0
IN_DISLEXIA                  0
IN_DISCALCULIA               0
IN_SABATISTA                 0
IN_GESTANTE                  0
IN_IDOSO                     0
TP_PRESENCA_CN               0
TP_PRESENCA_CH               0
TP_PRESENCA_LC               0
NU_NOTA_CN                3389
NU_NOTA_CH                3389
NU_NOTA_LC                3597
TP_LINGUA                    0
TP_STATUS_REDACAO         3597
NU_NOTA_COMP1             3597
NU_NOTA_COMP2             3597
NU_NOTA_COMP3             3597
NU_NOTA_COMP4             3597
NU_NOTA_COMP5             3597
NU_NOTA_REDACAO           3597
SG_UF_RE

In [8]:
# Vendo a quantidade de NaN's nos dados de teste
teste.isna().sum()#.sum()

CO_UF_RESIDENCIA             0
SG_UF_RESIDENCIA             0
NU_IDADE                     0
TP_SEXO                      0
TP_COR_RACA                  0
TP_NACIONALIDADE             0
TP_ST_CONCLUSAO              0
TP_ANO_CONCLUIU              0
TP_ESCOLA                    0
TP_ENSINO                 3096
IN_TREINEIRO                 0
TP_DEPENDENCIA_ADM_ESC    3096
IN_BAIXA_VISAO               0
IN_CEGUEIRA                  0
IN_SURDEZ                    0
IN_DISLEXIA                  0
IN_DISCALCULIA               0
IN_SABATISTA                 0
IN_GESTANTE                  0
IN_IDOSO                     0
TP_PRESENCA_CN               0
TP_PRESENCA_CH               0
TP_PRESENCA_LC               0
NU_NOTA_CN                1134
NU_NOTA_CH                1134
NU_NOTA_LC                1199
TP_LINGUA                    0
TP_STATUS_REDACAO         1199
NU_NOTA_COMP1             1199
NU_NOTA_COMP2             1199
NU_NOTA_COMP3             1199
NU_NOTA_COMP4             1199
NU_NOTA_

In [9]:
teste['NU_NOTA_LC'].isna().sum()

1199

In [10]:
# Fazendo as trocas dos dados faltantes
for categorica in categoric_feature:
    
    treino[categorica].fillna(method='ffill', inplace=True)
    teste[categorica].fillna(method='ffill', inplace=True)
    
for categorica in categoric_feature:
    
    treino[categorica].fillna(method='bfill', inplace=True)
    teste[categorica].fillna(method='bfill', inplace=True)

for numerica in numeric_features:
        
    #ratings_petz_apple[f'{num_stars}-star_diff'].mask(cond=ratings_petz_apple[f'{num_stars}-star_diff'] < 0, other=0)
    media_treino = treino[numerica].mean()
    treino[numerica].fillna(0, inplace=True)
    
    media_teste = teste[numerica].mean()
    teste[numerica].fillna(0, inplace=True)
    
treino['NU_NOTA_MT'].fillna(0, inplace=True)

In [11]:
# Conferindo se deu tudo certo com os dados de treino
treino.isna().sum().sum()

0

In [12]:
# Conferindo se deu tudo certo com os dados de teste
teste.isna().sum().sum()

0

In [13]:
# Instanciando um objeto da classe para o StandardScalar
std_scaler = StandardScaler()

In [14]:
# Aplicando o StandardScalar nas variáveis numéricas
treino[numeric_features] = std_scaler.fit_transform(treino[numeric_features])
teste[numeric_features] = std_scaler.fit_transform(teste[numeric_features])

In [15]:
# Conferindo os dados de treino
treino

Unnamed: 0,CO_UF_RESIDENCIA,NU_IDADE,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,IN_TREINEIRO,TP_DEPENDENCIA_ADM_ESC,...,TP_SEXO,Q001,Q002,Q006,Q024,Q025,Q026,Q027,Q047,NU_NOTA_MT
0,1.243484,0.341624,-1.177119,-0.188775,-0.888684,0.576221,-0.633058,-0.582810,-0.386319,-0.630839,...,M,D,D,C,A,A,C,H,A,399.4
1,-0.784493,-0.652215,0.808488,-0.188775,0.246774,-0.667092,1.157607,1.002369,-0.386319,1.159451,...,F,A,A,B,A,A,A,H,A,459.8
2,-0.784493,-0.084307,0.808488,-0.188775,1.382232,-0.667092,-0.633058,-0.582810,-0.386319,-0.630839,...,F,D,D,C,A,A,A,H,A,0.0
3,0.229495,0.483601,-2.169923,-0.188775,-0.888684,2.130362,-0.633058,-0.582810,-0.386319,-0.630839,...,F,H,E,E,C,B,C,F,D,0.0
4,-1.798482,0.909531,-0.184316,-0.188775,-0.888684,0.576221,-0.633058,-0.582810,-0.386319,-0.630839,...,M,E,D,C,A,A,B,F,A,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,0.432293,-0.226284,-1.177119,-0.188775,-0.888684,0.265393,-0.633058,-0.582810,-0.386319,-0.630839,...,M,E,E,G,B,B,B,C,A,403.2
13726,-0.480297,1.619416,0.808488,3.673938,-0.888684,2.441190,-0.633058,-0.582810,-0.386319,-0.630839,...,F,A,A,C,A,B,B,I,A,452.4
13727,1.243484,-0.936169,-1.177119,-0.188775,1.382232,-0.667092,-0.633058,-0.582810,2.588533,-0.630839,...,F,C,G,B,A,B,A,I,A,398.0
13728,0.229495,2.045347,0.808488,-0.188775,2.517691,-0.667092,-0.633058,-0.582810,-0.386319,-0.630839,...,M,B,D,G,B,B,C,E,A,386.6


In [16]:
# Aplicando o get_dummies() nas variáveis categóricas
encoded_columns = pd.get_dummies(treino[categoric_feature])
treino = treino.join(encoded_columns).drop(categoric_feature, axis=1)

encoded_columns = pd.get_dummies(teste[categoric_feature])
teste = teste.join(encoded_columns).drop(categoric_feature, axis=1)


#ohe.fit(treino[categoric_feature])
#ohe.fit(teste[categoric_feature])

In [17]:
# Instanciando um objeto da classe RandonForestRegressor()
model = RandomForestRegressor()

In [18]:
# Separando X e y de treino
X_train = treino.drop('NU_NOTA_MT', axis=1)
y_train = treino['NU_NOTA_MT']

In [19]:
# Para calcular o tempo de processamento desta célula
%%time

# Treinando o modelo
model.fit(X_train, y_train)

# Testando o modelo
y_test = model.predict(teste)

Wall time: 26.2 s


In [20]:
# Preparando a saída
y_test = pd.DataFrame(y_test)
y_test.rename(columns={0: 'NU_NOTA_MT'}, inplace=True)
y_test

Unnamed: 0,NU_NOTA_MT
0,419.557
1,433.167
2,591.324
3,0.000
4,553.028
...,...
4571,457.537
4572,492.132
4573,679.097
4574,450.012


In [21]:
# Juntando o número de inscrição aos dados preditos pelo modelo
my_answer = inscricao_teste.join(y_test)
my_answer

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,419.557
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,433.167
2,b38a03232f43b11c9d0788abaf060f7366053b6d,591.324
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,0.000
4,715494628a50142ce8cb17191cfe6d0f3cae0934,553.028
...,...,...
4571,dac0f22429c7f8e3931d0abaf5dfc8e5c772a48b,457.537
4572,a75fa8770257e7c9368d059fe53d9ef431f4bdef,492.132
4573,655fa6306720ff16e825903b5422a46608a77545,679.097
4574,1f4bc3e3d56212d500625bfe8ac78ccff4362293,450.012


In [22]:
my_answer.to_csv('answer.csv', index=False)