### Install bibliotecas

In [None]:
!pip3 install pycaret[full] --quiet

In [2]:
!pip3 uninstall lightgbm -y --quiet

In [3]:
!pip3 install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so" --quiet

[?25h    Running setup.py install for lightgbm ... [?25l[?25hdone


In [5]:
!pip3 install xgboost --quiet

### Leitura dos Dados

In [160]:
import pandas as pd
import numpy as np 
import pycaret
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive

In [161]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [162]:
microdados_rj = pd.read_csv("/content/gdrive/Shareddrives/TAAED - ENEM/DADOS/MICRODADOS_RJ_ENEM_2019.csv")

print(microdados_rj.info(verbose=True, null_counts=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338710 entries, 0 to 338709
Data columns (total 136 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   NU_INSCRICAO                 338710 non-null  int64  
 1   NU_ANO                       338710 non-null  int64  
 2   CO_MUNICIPIO_RESIDENCIA      338710 non-null  int64  
 3   NO_MUNICIPIO_RESIDENCIA      338710 non-null  object 
 4   CO_UF_RESIDENCIA             338710 non-null  int64  
 5   SG_UF_RESIDENCIA             338710 non-null  object 
 6   NU_IDADE                     338707 non-null  float64
 7   TP_SEXO                      338710 non-null  object 
 8   TP_ESTADO_CIVIL              338710 non-null  int64  
 9   TP_COR_RACA                  338710 non-null  int64  
 10  TP_NACIONALIDADE             338710 non-null  int64  
 11  CO_MUNICIPIO_NASCIMENTO      331626 non-null  float64
 12  NO_MUNICIPIO_NASCIMENTO      331626 non-null  object 
 13

### Transformação de features

#### Feature: Recém formado

In [163]:
microdados_rj['RECEM_FORMADO'] = 100

In [164]:
microdados_rj['TP_ANO_CONCLUIU'].value_counts()

0     148705
1      46886
13     29427
2      29159
3      18405
4      14589
5      11373
6       9123
7       7476
8       6025
9       5184
10      4640
11      3999
12      3719
Name: TP_ANO_CONCLUIU, dtype: int64

In [165]:
microdados_rj['Q025']

0         A
1         B
2         B
3         A
4         B
         ..
338705    B
338706    A
338707    B
338708    A
338709    B
Name: Q025, Length: 338710, dtype: object

In [166]:
microdados_rj.RECEM_FORMADO[(microdados_rj.TP_ST_CONCLUSAO == 2) & 
                            ((microdados_rj.TP_ANO_CONCLUIU == 0) | 
                             (microdados_rj.TP_ANO_CONCLUIU == 1 ))] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [167]:
microdados_rj['RECEM_FORMADO'].value_counts()

100    237667
1      101043
Name: RECEM_FORMADO, dtype: int64

In [168]:
microdados_rj.RECEM_FORMADO[(microdados_rj.RECEM_FORMADO != 1)] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [169]:
microdados_rj['RECEM_FORMADO'].value_counts()

0    237667
1    101043
Name: RECEM_FORMADO, dtype: int64

In [170]:
microdados_rj['RECEM_FORMADO'].dtype

dtype('int64')

#### Feature: Renda Familiar per Capita

In [171]:
condicao = [ microdados_rj['Q006'] =='A', microdados_rj['Q006']=='B', microdados_rj['Q006']=='C', microdados_rj['Q006']=='D', microdados_rj['Q006']=='E', microdados_rj['Q006']=='F', microdados_rj['Q006']=='G', microdados_rj['Q006']=='H',microdados_rj['Q006']=='I', microdados_rj['Q006']=='J', microdados_rj['Q006']=='K', microdados_rj['Q006']=='L', microdados_rj['Q006']=='M', microdados_rj['Q006']=='N', microdados_rj['Q006']=='O', microdados_rj['Q006']=='P']
resultado = [(1*954)/microdados_rj['Q005'], (1*954)/microdados_rj['Q005'], (1.5*954)/microdados_rj['Q005'], (2*954)/microdados_rj['Q005'], 2.5*954/microdados_rj['Q005'], 3*954/microdados_rj['Q005'], 4*954/microdados_rj['Q005'], 5*954/microdados_rj['Q005'], 6*954/microdados_rj['Q005'], 7*954/microdados_rj['Q005'], 8*954/microdados_rj['Q005'], 9*954/microdados_rj['Q005'], 10*954/microdados_rj['Q005'], 12*954/microdados_rj['Q005'], 15*954/microdados_rj['Q005'], 20*954/microdados_rj['Q005']]

microdados_rj['RENDA_PERCAPITA_FAMILIAR'] = np.select(condicao, resultado, 50*954/microdados_rj['Q005'])

In [172]:
microdados_rj['RENDA_PERCAPITA_FAMILIAR'].value_counts().head(15)

477.00     59243
954.00     26711
357.75     25559
715.50     22481
318.00     20662
238.50     20282
636.00     12652
596.25     12160
286.20     11884
795.00     10826
1192.50    10254
1431.00     9887
1908.00     9189
190.80      8121
1272.00     6622
Name: RENDA_PERCAPITA_FAMILIAR, dtype: int64

In [173]:
microdados_rj['CLASSE_RENDA_PERCAPITA_FAMILIAR'] = 0

In [174]:
condicao = [microdados_rj['RENDA_PERCAPITA_FAMILIAR'] < 250, 
            microdados_rj['RENDA_PERCAPITA_FAMILIAR'] < 500,  
            microdados_rj['RENDA_PERCAPITA_FAMILIAR'] < 750, 
            microdados_rj['RENDA_PERCAPITA_FAMILIAR'] < 1000,
            microdados_rj['RENDA_PERCAPITA_FAMILIAR'] < 1500,
            microdados_rj['RENDA_PERCAPITA_FAMILIAR'] < 2500,
            microdados_rj['RENDA_PERCAPITA_FAMILIAR'] < 5000,
            microdados_rj['RENDA_PERCAPITA_FAMILIAR'] >= 5000]
            
resultado = [0, 1, 2, 3, 4, 5, 6, 7]

microdados_rj['CLASSE_RENDA_PERCAPITA_FAMILIAR'] = np.select(condicao, resultado, 4)

In [175]:
microdados_rj.CLASSE_RENDA_PERCAPITA_FAMILIAR.value_counts().sort_index()

0     36466
1    125497
2     50257
3     40786
4     29862
5     26852
6     19383
7      9607
Name: CLASSE_RENDA_PERCAPITA_FAMILIAR, dtype: int64

In [176]:
microdados_rj.CLASSE_RENDA_PERCAPITA_FAMILIAR.dtype

dtype('int64')

#### Feature: Cor Raça

In [177]:
microdados_rj.TP_COR_RACA.value_counts()

1    141825
3    122212
2     60182
0      7812
4      5505
5      1174
Name: TP_COR_RACA, dtype: int64

In [178]:
microdados_rj['CLASSE_TP_COR_RACA'] = 100

In [179]:
microdados_rj.CLASSE_TP_COR_RACA[(microdados_rj.TP_COR_RACA == 1) | (microdados_rj.TP_COR_RACA == 4)] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [180]:
microdados_rj.CLASSE_TP_COR_RACA[(microdados_rj.TP_COR_RACA == 2) | (microdados_rj.TP_COR_RACA == 3)] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [181]:
microdados_rj.CLASSE_TP_COR_RACA[(microdados_rj.TP_COR_RACA == 5)] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [182]:
microdados_rj.CLASSE_TP_COR_RACA.value_counts()

1      182394
0      147330
100      7812
2        1174
Name: CLASSE_TP_COR_RACA, dtype: int64

In [183]:
microdados_rj.CLASSE_TP_COR_RACA.value_counts(dropna=False)

1      182394
0      147330
100      7812
2        1174
Name: CLASSE_TP_COR_RACA, dtype: int64

In [184]:
microdados_rj.shape

(338710, 140)

In [185]:
microdados_rj.CLASSE_TP_COR_RACA.value_counts().sum()

338710

In [186]:
microdados_rj.TP_COR_RACA.value_counts().sum()

338710

In [187]:
microdados_rj[microdados_rj['CLASSE_TP_COR_RACA'] != 100].shape

(330898, 140)

In [188]:
microdados_rj = microdados_rj[microdados_rj['CLASSE_TP_COR_RACA'] != 100]

In [189]:
microdados_rj.shape

(330898, 140)

In [190]:
microdados_rj.CLASSE_TP_COR_RACA.value_counts().sort_index()

0    147330
1    182394
2      1174
Name: CLASSE_TP_COR_RACA, dtype: int64

#### Feature: Idade

In [191]:
microdados_rj['CLASSE_IDADE'] = 100

In [192]:
condicao = [microdados_rj['NU_IDADE'] <= 14, microdados_rj['NU_IDADE'] == 15, microdados_rj['NU_IDADE'] == 16, microdados_rj['NU_IDADE'] == 17, microdados_rj['NU_IDADE'] == 18, microdados_rj['NU_IDADE'] == 19, microdados_rj['NU_IDADE'] ==20, microdados_rj['NU_IDADE'] == 21, microdados_rj['NU_IDADE'] == 22]
resultado = [0, 1, 1, 2, 3, 4, 5, 6, 6]

microdados_rj['CLASSE_IDADE'] = np.select(condicao, resultado, 7)

In [193]:
microdados_rj.CLASSE_IDADE.value_counts().sort_index()

0      101
1    19317
2    46371
3    59844
4    43391
5    29401
6    35118
7    97355
Name: CLASSE_IDADE, dtype: int64

#### Feature: Escolaridade dos pais

In [194]:
microdados_rj.Q001.value_counts()

E    104559
B     53001
C     42387
D     40611
F     31315
H     30060
G     19638
A      9327
Name: Q001, dtype: int64

In [195]:
microdados_rj['CLASSE_ESCOLARIDADE_PAI'] = 100

In [196]:
microdados_rj.CLASSE_ESCOLARIDADE_PAI[(microdados_rj.Q001=='A') | 
                                      (microdados_rj.Q001 == 'B') | 
                                      (microdados_rj.Q001 == 'C') | 
                                      (microdados_rj.Q001 == 'D')] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [197]:
microdados_rj.CLASSE_ESCOLARIDADE_PAI[(microdados_rj.Q001 == 'E')] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [198]:
microdados_rj.CLASSE_ESCOLARIDADE_PAI[(microdados_rj.Q001 == 'F') | 
                                      (microdados_rj.Q001 == 'G')] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [199]:
microdados_rj.CLASSE_ESCOLARIDADE_PAI.value_counts().sort_index()

0      145326
1      104559
2       50953
100     30060
Name: CLASSE_ESCOLARIDADE_PAI, dtype: int64

In [200]:
microdados_rj.Q002.value_counts()

E    122195
D     46471
B     42418
C     40850
F     37260
G     25867
H      8303
A      7534
Name: Q002, dtype: int64

In [201]:
microdados_rj['CLASSE_ESCOLARIDADE_MAE'] = 100

In [202]:
microdados_rj.CLASSE_ESCOLARIDADE_MAE[(microdados_rj.Q002 == 'A') | 
                                      (microdados_rj.Q002 == 'B') | 
                                      (microdados_rj.Q002 == 'C') | 
                                      (microdados_rj.Q002 == 'D')] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [203]:
microdados_rj.CLASSE_ESCOLARIDADE_MAE[(microdados_rj.Q002 == 'E')] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [204]:
microdados_rj.CLASSE_ESCOLARIDADE_MAE[(microdados_rj.Q002 == 'F') | 
                                      (microdados_rj.Q002 == 'G')] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [205]:
microdados_rj.CLASSE_ESCOLARIDADE_MAE.value_counts().sort_index()

0      137273
1      122195
2       63127
100      8303
Name: CLASSE_ESCOLARIDADE_MAE, dtype: int64

In [206]:
microdados_rj['Q001_Q002'] = 100

In [207]:
microdados_rj.Q001_Q002[(microdados_rj.CLASSE_ESCOLARIDADE_PAI == 0) | 
                        (microdados_rj.CLASSE_ESCOLARIDADE_MAE == 0)] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [208]:
microdados_rj.Q001_Q002.value_counts()

0      188570
100    142328
Name: Q001_Q002, dtype: int64

In [209]:
microdados_rj.Q001_Q002[(microdados_rj.CLASSE_ESCOLARIDADE_PAI == 1) | 
                        (microdados_rj.CLASSE_ESCOLARIDADE_MAE == 1)] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [210]:
microdados_rj.Q001_Q002.value_counts()

1      170700
0      120695
100     39503
Name: Q001_Q002, dtype: int64

In [211]:
microdados_rj.Q001_Q002[(microdados_rj.CLASSE_ESCOLARIDADE_PAI == 2) | 
                        (microdados_rj.CLASSE_ESCOLARIDADE_MAE == 2)] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [212]:
microdados_rj.Q001_Q002.value_counts()

1      134444
0      107833
2       83087
100      5534
Name: Q001_Q002, dtype: int64

In [213]:
microdados_rj.shape

(330898, 144)

In [214]:
microdados_rj = microdados_rj[microdados_rj['Q001_Q002'] != 100]

In [215]:
microdados_rj.shape

(325364, 144)

#### Feature: Computador

In [216]:
microdados_rj['Q024'].value_counts()

B    162970
A    113217
C     33419
D     10870
E      4888
Name: Q024, dtype: int64

In [217]:
microdados_rj.Q024[(microdados_rj.Q024 == 'A')] = 0
microdados_rj.Q024[(microdados_rj.Q024 != 0)] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [218]:
microdados_rj.Q024.value_counts()

1    212147
0    113217
Name: Q024, dtype: int64

#### Feature: Internet

In [219]:
microdados_rj['Q025'].value_counts()

B    283221
A     42143
Name: Q025, dtype: int64

In [220]:
microdados_rj.Q025[(microdados_rj.Q025 == 'A')] = 0
microdados_rj.Q025[(microdados_rj.Q025 == 'B')] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [221]:
microdados_rj['Q025'].value_counts()

1    283221
0     42143
Name: Q025, dtype: int64

### Transformação dos targets

#### Rendimento Mat

In [222]:
microdados_rj['CLASSE_RENDIMENTO'] = 'A'

In [223]:
condicao = [microdados_rj['NU_NOTA_MT'] < 450, 
            microdados_rj['NU_NOTA_MT'] < 650, 
            microdados_rj['NU_NOTA_MT'] >= 650]
            
resultado = ['INSUFICIENTE', 'REGULAR', 'EXCELENTE']

microdados_rj['CLASSE_RENDIMENTO'] = np.select(condicao, resultado)

In [224]:
microdados_rj['CLASSE_RENDIMENTO'].value_counts()

REGULAR         130121
0                89531
INSUFICIENTE     61591
EXCELENTE        44121
Name: CLASSE_RENDIMENTO, dtype: int64

In [225]:
microdados_rj.loc[microdados_rj['CLASSE_RENDIMENTO'] == '0', 'CLASSE_RENDIMENTO'] = np.nan

In [226]:
microdados_rj['CLASSE_RENDIMENTO'].value_counts()

REGULAR         130121
INSUFICIENTE     61591
EXCELENTE        44121
Name: CLASSE_RENDIMENTO, dtype: int64

#### Rendimento Geral

In [227]:
microdados_rj['MEDIA_GERAL'] = (microdados_rj['NU_NOTA_CN'] + 
                                microdados_rj['NU_NOTA_CH'] + 
                                microdados_rj['NU_NOTA_LC'] + 
                                microdados_rj['NU_NOTA_MT'] + 
                                microdados_rj['NU_NOTA_REDACAO']) / 5

In [228]:
condicao = [microdados_rj['MEDIA_GERAL'] < 450, 
            microdados_rj['MEDIA_GERAL'] < 650, 
            microdados_rj['MEDIA_GERAL'] >= 650]
            
resultado = ['INSUFICIENTE', 'REGULAR', 'EXCELENTE']

microdados_rj['CLASSE_RENDIMENTO_GERAL'] = np.select(condicao, resultado)

In [229]:
microdados_rj['CLASSE_RENDIMENTO_GERAL'].value_counts()

REGULAR         178637
0                90085
INSUFICIENTE     30344
EXCELENTE        26298
Name: CLASSE_RENDIMENTO_GERAL, dtype: int64

In [230]:
microdados_rj.loc[microdados_rj['CLASSE_RENDIMENTO_GERAL'] == '0', 'CLASSE_RENDIMENTO_GERAL'] = np.nan

In [231]:
microdados_rj['CLASSE_RENDIMENTO_GERAL'].value_counts()

REGULAR         178637
INSUFICIENTE     30344
EXCELENTE        26298
Name: CLASSE_RENDIMENTO_GERAL, dtype: int64

### Filtragem do dataset e save dos dados

In [241]:
microdados_rj[['RECEM_FORMADO','CLASSE_RENDA_PERCAPITA_FAMILIAR','CLASSE_TP_COR_RACA','CLASSE_IDADE','Q001_Q002','Q024','Q025','CLASSE_RENDIMENTO', 'CLASSE_RENDIMENTO_GERAL']]

Unnamed: 0,RECEM_FORMADO,CLASSE_RENDA_PERCAPITA_FAMILIAR,CLASSE_TP_COR_RACA,CLASSE_IDADE,Q001_Q002,Q024,Q025,CLASSE_RENDIMENTO,CLASSE_RENDIMENTO_GERAL
0,0,1,1,6,0,0,0,,
1,0,3,1,7,0,1,1,REGULAR,INSUFICIENTE
2,0,1,1,6,0,1,1,REGULAR,REGULAR
3,0,1,1,6,0,1,0,INSUFICIENTE,REGULAR
4,0,1,0,7,0,1,1,INSUFICIENTE,REGULAR
...,...,...,...,...,...,...,...,...,...
325359,0,5,1,7,1,0,1,,
325360,0,0,1,7,0,0,0,REGULAR,REGULAR
325361,0,1,1,7,1,0,1,INSUFICIENTE,REGULAR
325362,0,2,1,5,0,0,0,INSUFICIENTE,REGULAR


In [242]:
microdados_rj[['RECEM_FORMADO','CLASSE_RENDA_PERCAPITA_FAMILIAR','CLASSE_TP_COR_RACA','CLASSE_IDADE','Q001_Q002','Q024','Q025','CLASSE_RENDIMENTO', 'CLASSE_RENDIMENTO_GERAL']].isna().sum()

RECEM_FORMADO                          0
CLASSE_RENDA_PERCAPITA_FAMILIAR        0
CLASSE_TP_COR_RACA                     0
CLASSE_IDADE                           0
Q001_Q002                              0
Q024                                   0
Q025                                   0
CLASSE_RENDIMENTO                  89531
CLASSE_RENDIMENTO_GERAL            90085
dtype: int64

In [243]:
microdados_rj = microdados_rj[['RECEM_FORMADO','CLASSE_RENDA_PERCAPITA_FAMILIAR','CLASSE_TP_COR_RACA','CLASSE_IDADE','Q001_Q002','Q024','Q025','CLASSE_RENDIMENTO', 'CLASSE_RENDIMENTO_GERAL']]

In [244]:
microdados_rj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325364 entries, 0 to 325363
Data columns (total 9 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   RECEM_FORMADO                    325364 non-null  int64 
 1   CLASSE_RENDA_PERCAPITA_FAMILIAR  325364 non-null  int64 
 2   CLASSE_TP_COR_RACA               325364 non-null  int64 
 3   CLASSE_IDADE                     325364 non-null  int64 
 4   Q001_Q002                        325364 non-null  int64 
 5   Q024                             325364 non-null  int64 
 6   Q025                             325364 non-null  int64 
 7   CLASSE_RENDIMENTO                235833 non-null  object
 8   CLASSE_RENDIMENTO_GERAL          235279 non-null  object
dtypes: int64(7), object(2)
memory usage: 22.3+ MB


In [245]:
microdados_rj['Q024'] = microdados_rj['Q024'].astype(int)
microdados_rj['Q025'] = microdados_rj['Q025'].astype(int)

In [246]:
microdados_rj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325364 entries, 0 to 325363
Data columns (total 9 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   RECEM_FORMADO                    325364 non-null  int64 
 1   CLASSE_RENDA_PERCAPITA_FAMILIAR  325364 non-null  int64 
 2   CLASSE_TP_COR_RACA               325364 non-null  int64 
 3   CLASSE_IDADE                     325364 non-null  int64 
 4   Q001_Q002                        325364 non-null  int64 
 5   Q024                             325364 non-null  int64 
 6   Q025                             325364 non-null  int64 
 7   CLASSE_RENDIMENTO                235833 non-null  object
 8   CLASSE_RENDIMENTO_GERAL          235279 non-null  object
dtypes: int64(7), object(2)
memory usage: 22.3+ MB


In [247]:
microdados_rj.reset_index(drop=True, inplace=True)

microdados_rj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325364 entries, 0 to 325363
Data columns (total 9 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   RECEM_FORMADO                    325364 non-null  int64 
 1   CLASSE_RENDA_PERCAPITA_FAMILIAR  325364 non-null  int64 
 2   CLASSE_TP_COR_RACA               325364 non-null  int64 
 3   CLASSE_IDADE                     325364 non-null  int64 
 4   Q001_Q002                        325364 non-null  int64 
 5   Q024                             325364 non-null  int64 
 6   Q025                             325364 non-null  int64 
 7   CLASSE_RENDIMENTO                235833 non-null  object
 8   CLASSE_RENDIMENTO_GERAL          235279 non-null  object
dtypes: int64(7), object(2)
memory usage: 22.3+ MB


In [248]:
microdados_rj.to_csv("/content/gdrive/Shareddrives/TAAED - ENEM/DADOS/dataframe_tratado.csv", 
                     index=False,
                     header=True, 
                     encoding="utf-8")

In [249]:
microdados_rj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325364 entries, 0 to 325363
Data columns (total 9 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   RECEM_FORMADO                    325364 non-null  int64 
 1   CLASSE_RENDA_PERCAPITA_FAMILIAR  325364 non-null  int64 
 2   CLASSE_TP_COR_RACA               325364 non-null  int64 
 3   CLASSE_IDADE                     325364 non-null  int64 
 4   Q001_Q002                        325364 non-null  int64 
 5   Q024                             325364 non-null  int64 
 6   Q025                             325364 non-null  int64 
 7   CLASSE_RENDIMENTO                235833 non-null  object
 8   CLASSE_RENDIMENTO_GERAL          235279 non-null  object
dtypes: int64(7), object(2)
memory usage: 22.3+ MB


## Modelagem 

#### Experimento 1 - Dados do RJ, 2019, mesmas features do artigo e target média

In [250]:
microdados_rj = pd.read_csv("/content/gdrive/Shareddrives/TAAED - ENEM/DADOS/dataframe_tratado.csv")

print(microdados_rj.info(verbose=True, null_counts=True))

microdados_rj = microdados_rj[microdados_rj['CLASSE_RENDIMENTO_GERAL'] != '0']

microdados_rj.reset_index(drop=True, inplace=True)

print(microdados_rj.info(verbose=True, null_counts=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325364 entries, 0 to 325363
Data columns (total 9 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   RECEM_FORMADO                    325364 non-null  int64 
 1   CLASSE_RENDA_PERCAPITA_FAMILIAR  325364 non-null  int64 
 2   CLASSE_TP_COR_RACA               325364 non-null  int64 
 3   CLASSE_IDADE                     325364 non-null  int64 
 4   Q001_Q002                        325364 non-null  int64 
 5   Q024                             325364 non-null  int64 
 6   Q025                             325364 non-null  int64 
 7   CLASSE_RENDIMENTO                235833 non-null  object
 8   CLASSE_RENDIMENTO_GERAL          235279 non-null  object
dtypes: int64(7), object(2)
memory usage: 22.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325364 entries, 0 to 325363
Data columns (total 9 columns):
 #   Column                           Non-

In [251]:
microdados_rj.CLASSE_RENDIMENTO_GERAL[(microdados_rj.CLASSE_RENDIMENTO_GERAL == 'INSUFICIENTE')] = 0
microdados_rj.CLASSE_RENDIMENTO_GERAL[(microdados_rj.CLASSE_RENDIMENTO_GERAL == 'REGULAR')] = 1
microdados_rj.CLASSE_RENDIMENTO_GERAL[(microdados_rj.CLASSE_RENDIMENTO_GERAL == 'EXCELENTE')] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [252]:
from pycaret.classification import *

init = setup(data=microdados_rj,
             ignore_features=['CLASSE_RENDIMENTO', 'Q024', 'Q025'],
             target='CLASSE_RENDIMENTO_GERAL',
             train_size=0.8,
             fold=5,
             fix_imbalance=False,
             use_gpu=True,
             silent=True,
             session_id=7)

Unnamed: 0,Description,Value
0,session_id,7
1,Target,CLASSE_RENDIMENTO_GERAL
2,Target Type,Multiclass
3,Label Encoded,"0.0: 0, 1.0: 1, 2.0: 2"
4,Original Data,"(325364, 9)"
5,Missing Values,True
6,Numeric Features,0
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


In [253]:
compare_models(include=['lr', 'dt', 
                        'rf', 'nb',
                        'knn', 'lightgbm'], fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7668,0.6697,0.3992,0.7164,0.6936,0.1307,0.1957,4.074
rf,Random Forest Classifier,0.7663,0.6676,0.3985,0.709,0.693,0.1289,0.193,7.732
dt,Decision Tree Classifier,0.7661,0.6674,0.3988,0.707,0.6933,0.1295,0.1928,0.362
lr,Logistic Regression,0.7643,0.664,0.3888,0.6934,0.687,0.1106,0.1742,15.888
knn,K Neighbors Classifier,0.7419,0.5911,0.4158,0.6772,0.6923,0.1405,0.163,82.422
nb,Naive Bayes,0.4228,0.6463,0.5727,0.6822,0.4514,0.1484,0.1991,0.238


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               device='gpu', importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=7, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [254]:
nb = create_model('nb',
                  fold=5)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.4194,0.6471,0.5743,0.6825,0.447,0.1481,0.1999
1,0.429,0.6529,0.5818,0.6889,0.4575,0.1566,0.2097
2,0.4172,0.6431,0.5701,0.6809,0.445,0.1455,0.1965
3,0.4239,0.6467,0.5719,0.6814,0.4532,0.1478,0.198
4,0.4244,0.6416,0.5653,0.6774,0.4546,0.1441,0.1916
Mean,0.4228,0.6463,0.5727,0.6822,0.4514,0.1484,0.1991
SD,0.0041,0.0039,0.0054,0.0038,0.0047,0.0043,0.0059


In [256]:
nb

GaussianNB(priors=None, var_smoothing=1e-09)

In [258]:
tuned_nb = tune_model(nb,
                      n_iter=300,
                      fold=5,
                      optimize='F1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.744,0.65,0.44,0.6409,0.6881,0.1731,0.1959
1,0.7471,0.6549,0.4309,0.6403,0.6884,0.1629,0.1894
2,0.7429,0.6451,0.4327,0.6385,0.6861,0.1626,0.1856
3,0.747,0.6476,0.4356,0.6414,0.6893,0.1701,0.1956
4,0.744,0.646,0.433,0.6392,0.6868,0.164,0.1878
Mean,0.745,0.6487,0.4344,0.6401,0.6877,0.1665,0.1908
SD,0.0017,0.0035,0.0031,0.0011,0.0011,0.0043,0.0042


In [259]:
tuned_nb

GaussianNB(priors=None, var_smoothing=1)

#### Experimento 2 - Dados do RJ, 2019, features do artigo + (computador e internet) e target média

In [260]:
from pycaret.classification import *

init = setup(data=microdados_rj,
             ignore_features=['CLASSE_RENDIMENTO'],
             target='CLASSE_RENDIMENTO_GERAL',
             train_size=0.8,
             fold=5,
             fix_imbalance=False,
             use_gpu=True,
             silent=True,
             session_id=7)

Unnamed: 0,Description,Value
0,session_id,7
1,Target,CLASSE_RENDIMENTO_GERAL
2,Target Type,Multiclass
3,Label Encoded,"0.0: 0, 1.0: 1, 2.0: 2"
4,Original Data,"(325364, 9)"
5,Missing Values,True
6,Numeric Features,0
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,False


In [261]:
compare_models(include=['lr', 'dt', 
                        'rf', 'nb',
                        'knn', 'lightgbm'], fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7669,0.6748,0.4005,0.72,0.6941,0.1328,0.1977,4.49
rf,Random Forest Classifier,0.766,0.6669,0.4008,0.71,0.6947,0.1331,0.1949,9.332
dt,Decision Tree Classifier,0.7653,0.6665,0.4011,0.7045,0.695,0.1332,0.1926,0.406
lr,Logistic Regression,0.7645,0.6696,0.3888,0.7206,0.6873,0.1108,0.175,19.332
knn,K Neighbors Classifier,0.736,0.5913,0.4196,0.6763,0.6923,0.1425,0.1596,89.348
nb,Naive Bayes,0.4248,0.652,0.5838,0.6871,0.4509,0.1559,0.2086,0.236


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               device='gpu', importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=7, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [262]:
nb = create_model('nb',
                  fold=5)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.4211,0.6518,0.5845,0.6854,0.4461,0.1544,0.2076
1,0.4277,0.6578,0.5909,0.6923,0.4533,0.1615,0.2165
2,0.4168,0.6503,0.5829,0.6874,0.4412,0.1533,0.2076
3,0.43,0.6532,0.5828,0.6857,0.4576,0.1565,0.2075
4,0.4284,0.6468,0.5777,0.6846,0.4566,0.1539,0.204
Mean,0.4248,0.652,0.5838,0.6871,0.4509,0.1559,0.2086
SD,0.005,0.0036,0.0043,0.0028,0.0063,0.003,0.0042


In [263]:
nb

GaussianNB(priors=None, var_smoothing=1e-09)

In [264]:
tuned_nb = tune_model(nb,
                      n_iter=300,
                      fold=5,
                      optimize='F1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7302,0.6565,0.4651,0.6909,0.6894,0.1966,0.2087
1,0.7332,0.6605,0.4651,0.6925,0.6907,0.1987,0.212
2,0.7252,0.6529,0.462,0.6846,0.6882,0.1893,0.1994
3,0.7304,0.6555,0.4657,0.6904,0.6925,0.1981,0.2094
4,0.7264,0.6519,0.4624,0.6843,0.6885,0.1907,0.2012
Mean,0.7291,0.6555,0.4641,0.6886,0.6898,0.1947,0.2061
SD,0.0029,0.003,0.0015,0.0034,0.0016,0.0039,0.0049


In [265]:
tuned_nb

GaussianNB(priors=None, var_smoothing=1)

#### Experimento 3 - Dados do RJ, 2019, mesmas features do artigo e target nota de matemática

In [266]:
from pycaret.classification import *

init = setup(data=microdados_rj,
             ignore_features=['CLASSE_RENDIMENTO_GERAL', 'Q024', 'Q025'],
             target='CLASSE_RENDIMENTO',
             train_size=0.8,
             fold=5,
             fix_imbalance=False,
             use_gpu=True,
             silent=True,
             session_id=7)

Unnamed: 0,Description,Value
0,session_id,7
1,Target,CLASSE_RENDIMENTO
2,Target Type,Multiclass
3,Label Encoded,"EXCELENTE: 0, INSUFICIENTE: 1, REGULAR: 2"
4,Original Data,"(325364, 9)"
5,Missing Values,True
6,Numeric Features,0
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


In [267]:
compare_models(include=['lr', 'dt', 
                        'rf', 'nb',
                        'knn', 'lightgbm'], fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.5787,0.6555,0.4373,0.5676,0.5105,0.1568,0.1965,4.096
rf,Random Forest Classifier,0.5785,0.6533,0.4358,0.5668,0.5081,0.1545,0.1953,7.456
dt,Decision Tree Classifier,0.5782,0.6531,0.4384,0.5647,0.5121,0.1581,0.1961,0.356
lr,Logistic Regression,0.5766,0.6547,0.4333,0.5682,0.4975,0.1483,0.192,10.278
knn,K Neighbors Classifier,0.499,0.5903,0.4502,0.4914,0.4939,0.1394,0.1399,82.998
nb,Naive Bayes,0.4158,0.6413,0.5209,0.5081,0.3755,0.1776,0.2159,0.24


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               device='gpu', importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=7, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [268]:
nb = create_model('nb',
                  fold=5)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.4102,0.6459,0.5239,0.5106,0.3613,0.1776,0.2216
1,0.4195,0.6445,0.5224,0.5094,0.3821,0.1801,0.2171
2,0.4127,0.6348,0.5158,0.4998,0.3739,0.1716,0.2073
3,0.4126,0.6405,0.5201,0.5099,0.3694,0.1762,0.2168
4,0.4241,0.641,0.5221,0.5106,0.3909,0.1824,0.2168
Mean,0.4158,0.6413,0.5209,0.5081,0.3755,0.1776,0.2159
SD,0.0052,0.0039,0.0028,0.0042,0.0102,0.0037,0.0046


In [269]:
nb

GaussianNB(priors=None, var_smoothing=1e-09)

In [270]:
tuned_nb = tune_model(nb,
                      n_iter=300,
                      fold=5,
                      optimize='F1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.559,0.6511,0.4809,0.5426,0.5398,0.1999,0.2065
1,0.5545,0.6476,0.479,0.5397,0.5387,0.1959,0.201
2,0.549,0.6392,0.4684,0.5316,0.5294,0.1803,0.1864
3,0.5495,0.6413,0.4725,0.5335,0.5334,0.187,0.1918
4,0.5516,0.6417,0.47,0.5336,0.5309,0.1826,0.1894
Mean,0.5527,0.6442,0.4742,0.5362,0.5344,0.1891,0.195
SD,0.0037,0.0044,0.0049,0.0042,0.0041,0.0076,0.0076


In [271]:
tuned_nb

GaussianNB(priors=None, var_smoothing=1)

#### Experimento 4 - Dados do RJ, 2019, features do artigo + (computador e internet) e target nota de matemática

In [272]:
from pycaret.classification import *

init = setup(data=microdados_rj,
             ignore_features=['CLASSE_RENDIMENTO_GERAL'],
             target='CLASSE_RENDIMENTO',
             train_size=0.8,
             fold=5,
             fix_imbalance=False,
             use_gpu=True,
             silent=True,
             session_id=7)

Unnamed: 0,Description,Value
0,session_id,7
1,Target,CLASSE_RENDIMENTO
2,Target Type,Multiclass
3,Label Encoded,"EXCELENTE: 0, INSUFICIENTE: 1, REGULAR: 2"
4,Original Data,"(325364, 9)"
5,Missing Values,True
6,Numeric Features,0
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,False


In [275]:
compare_models(include=['lr', 'dt', 
                        'rf', 'nb',
                        'knn', 'lightgbm'], fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.5804,0.6591,0.4394,0.5706,0.5145,0.1609,0.2003,4.502
lr,Logistic Regression,0.5785,0.6585,0.4436,0.567,0.5149,0.1642,0.2002,11.354
rf,Random Forest Classifier,0.5773,0.6516,0.4425,0.5614,0.5184,0.1636,0.1966,9.534
dt,Decision Tree Classifier,0.5761,0.6508,0.4431,0.5587,0.5187,0.1637,0.195,0.418
knn,K Neighbors Classifier,0.5063,0.5982,0.4489,0.4941,0.4971,0.1418,0.143,90.842
nb,Naive Bayes,0.4158,0.6456,0.5287,0.5108,0.3713,0.1839,0.2232,0.25


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               device='gpu', importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=7, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [276]:
nb = create_model('nb',
                  fold=5)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.4157,0.6507,0.5321,0.5149,0.3683,0.1864,0.2282
1,0.4157,0.6487,0.5306,0.5119,0.3698,0.1851,0.2254
2,0.4132,0.6398,0.5244,0.502,0.3704,0.1784,0.2152
3,0.4154,0.645,0.529,0.515,0.3697,0.1849,0.2257
4,0.419,0.6436,0.5276,0.5102,0.3785,0.1847,0.2217
Mean,0.4158,0.6456,0.5287,0.5108,0.3713,0.1839,0.2232
SD,0.0019,0.0038,0.0027,0.0048,0.0036,0.0028,0.0045


In [277]:
nb

GaussianNB(priors=None, var_smoothing=1e-09)

In [278]:
tuned_nb = tune_model(nb,
                      n_iter=300,
                      fold=5,
                      optimize='F1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.5549,0.656,0.4957,0.5426,0.5401,0.2119,0.2163
1,0.5574,0.6528,0.4953,0.5448,0.5426,0.2136,0.2181
2,0.5457,0.6441,0.4857,0.5335,0.5324,0.1966,0.2002
3,0.5516,0.6479,0.4896,0.5384,0.5377,0.2044,0.2083
4,0.551,0.6462,0.4877,0.5375,0.5353,0.2004,0.2051
Mean,0.5521,0.6494,0.4908,0.5394,0.5376,0.2054,0.2096
SD,0.004,0.0044,0.004,0.004,0.0036,0.0065,0.0068


In [279]:
tuned_nb

GaussianNB(priors=None, var_smoothing=1)