In [1]:
import pandas as pd
import numpy as np
from unidecode import unidecode

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder

  from .autonotebook import tqdm as notebook_tqdm


#### Funções

In [2]:
def remove_acentos(texto):
    """Return ``texto`` coerced to ``str`` and passed through ``unidecode``.

    Args:
        texto: Any value; it is converted with ``str()`` before transliteration.

    Returns:
        The string produced by ``unidecode`` for the coerced text.
    """
    texto_str = str(texto)
    return unidecode(texto_str)

In [3]:
def minusculo(texto):
    """Return ``texto`` as a lower-case string with spaces replaced by underscores.

    The value is coerced to ``str`` *before* any string method is called, so
    non-string inputs (e.g. integers read from a mixed-type column) are handled
    instead of raising ``AttributeError``.

    Args:
        texto: Any value; it is converted with ``str()`` first.

    Returns:
        The coerced text with every ``' '`` replaced by ``'_'`` and lower-cased.
    """
    return str(texto).replace(' ', '_').lower()

#### Leitura

In [4]:
# Load the 2021 course microdata (semicolon-separated, ISO-8859-1 encoded).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns and the warning
# that chunked inference can emit on read.
df_2021 = pd.read_csv(
    'dados/microdados_2021/dados/MICRODADOS_CADASTRO_CURSOS_2021.CSV',
    encoding='ISO-8859-1',
    sep=';',
    low_memory=False,
)

  df_2021 = pd.read_csv('dados/microdados_2021/dados/MICRODADOS_CADASTRO_CURSOS_2021.CSV',encoding='ISO-8859-1',sep=';')


In [5]:
# Load the pre-processed indicators table (comma-separated CSV).
df_indicadores = pd.read_csv(
    'dados_processados/indicadores.csv',
    sep=',',
)

In [9]:
# Build a ProfileReport for df_2021 with minimal=True and a custom title.
# NOTE(review): this cell's execution count (9) is reused by a later cell, so the
# notebook was probably run out of order — confirm it survives Restart & Run All.
profile = ProfileReport(df_2021, title='Relatório de Dados Mínimo', minimal=True)

In [10]:
# Export the profiling report to the relative path 'microdados.html'.
profile.to_file('microdados.html')

Summarize dataset: 100%|██████████████████████████████████████████████████| 206/206 [00:03<00:00, 55.96it/s, Completed]
Generate report structure: 100%|█████████████████████████████████████████████████████████| 1/1 [00:43<00:00, 43.25s/it]
Render HTML: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.66s/it]
Export report to file: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.24it/s]


#### Limpeza e ajuste de dados

In [6]:
# List the distinct values present in TP_MODALIDADE_ENSINO.
df_2021['TP_MODALIDADE_ENSINO'].unique()

array([2, 1])

In [7]:
# Keep only the rows whose TP_MODALIDADE_ENSINO equals 1 and renumber the index from zero.
df_2021 = (
    df_2021.loc[df_2021['TP_MODALIDADE_ENSINO'] == 1]
    .reset_index(drop=True)
    .copy()
)

In [8]:
# Re-check the distinct values of TP_MODALIDADE_ENSINO.
df_2021['TP_MODALIDADE_ENSINO'].unique()

array([1])

In [9]:
# Display df_2021 (the notebook renders a truncated preview of rows and columns).
df_2021

Unnamed: 0,NU_ANO_CENSO,NO_REGIAO,CO_REGIAO,NO_UF,SG_UF,CO_UF,NO_MUNICIPIO,CO_MUNICIPIO,IN_CAPITAL,TP_DIMENSAO,...,QT_MAT_APOIO_SOCIAL,QT_CONC_APOIO_SOCIAL,QT_ATIV_EXTRACURRICULAR,QT_ING_ATIV_EXTRACURRICULAR,QT_MAT_ATIV_EXTRACURRICULAR,QT_CONC_ATIV_EXTRACURRICULAR,QT_MOB_ACADEMICA,QT_ING_MOB_ACADEMICA,QT_MAT_MOB_ACADEMICA,QT_CONC_MOB_ACADEMICA
0,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,0,0,0,0,0,0,0,0,0,0
1,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,0,0,1,0,0,0,0,0,0,0
2,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,26,0,43,1,3,0,0,0,0,0
3,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,0,0,0,0,0,0,0,0,0,0
4,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35745,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,2,0,120,25,107,42,0,0,0,0
35746,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,0,0,6,0,1,1,0,0,0,0
35747,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,0,0,12,1,11,1,0,0,0,0
35748,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,0,0,8,0,6,6,0,0,0,0


#### Ajustes no dataframe de microdados

In [10]:
# Swap the numeric TP_GRAU_ACADEMICO codes 1-4 for their text labels;
# values not listed in the mapping are left untouched.
df_2021['TP_GRAU_ACADEMICO'] = df_2021['TP_GRAU_ACADEMICO'].replace(
    {
        1: 'bacharelado',
        2: 'licenciatura',
        3: 'tecnologico',
        4: 'bacharelado e licenciatura',
    }
)

In [11]:
# Normalise every column label of df_2021 to lower case.
df_2021.columns = df_2021.columns.map(str.lower)

In [12]:
# Normalise every object-dtype column: strip accents, replace spaces with
# underscores and lower-case the text.
# na_action='ignore' leaves missing values as NaN; without it, remove_acentos
# would coerce them with str() into the literal text 'nan', hiding missing data
# from any later isnull()/isna() check.
colunas_categoricas = df_2021.select_dtypes(include=['object']).columns
for coluna in colunas_categoricas:
    df_2021[coluna] = (
        df_2021[coluna]
        .map(remove_acentos, na_action='ignore')
        .map(minusculo, na_action='ignore')
    )

In [13]:
# Display df_2021 again (the notebook renders a truncated preview).
df_2021

Unnamed: 0,nu_ano_censo,no_regiao,co_regiao,no_uf,sg_uf,co_uf,no_municipio,co_municipio,in_capital,tp_dimensao,...,qt_mat_apoio_social,qt_conc_apoio_social,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular,qt_mob_academica,qt_ing_mob_academica,qt_mat_mob_academica,qt_conc_mob_academica
0,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,0,0,0,0,0,0,0,0,0,0
1,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,0,0,1,0,0,0,0,0,0,0
2,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,26,0,43,1,3,0,0,0,0,0
3,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,0,0,0,0,0,0,0,0,0,0
4,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35745,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,2,0,120,25,107,42,0,0,0,0
35746,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,0,0,6,0,1,1,0,0,0,0
35747,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,0,0,12,1,11,1,0,0,0,0
35748,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,0,0,8,0,6,6,0,0,0,0


In [14]:
# Print the dtype of each df_indicadores column.
print(df_indicadores.dtypes)

tp_grau_academico    object
co_curso              int64
enade_faixa           int64
cpc_faixa             int64
dtype: object


In [15]:
# Show the column labels of df_indicadores.
df_indicadores.columns

Index(['tp_grau_academico', 'co_curso', 'enade_faixa', 'cpc_faixa'], dtype='object')

In [16]:
# An inner join (not a left join) is used on purpose: courses without a match in
# df_indicadores are dropped, so enade_faixa and cpc_faixa never come out as NaN.
resultado = pd.merge(
    df_2021,
    df_indicadores[['co_curso', 'enade_faixa', 'cpc_faixa']],
    how='inner',
    on='co_curso',
)

In [17]:
# Display the merged frame (the notebook renders a truncated preview).
resultado

Unnamed: 0,nu_ano_censo,no_regiao,co_regiao,no_uf,sg_uf,co_uf,no_municipio,co_municipio,in_capital,tp_dimensao,...,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular,qt_mob_academica,qt_ing_mob_academica,qt_mat_mob_academica,qt_conc_mob_academica,enade_faixa,cpc_faixa
0,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,73,1,73,4,0,0,0,0,4,4
1,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,31,0,31,5,0,0,0,0,3,3
2,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,88,0,86,10,0,0,0,0,4,4
3,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,113,3,111,24,0,0,0,0,5,4
4,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,104,0,103,28,0,0,0,0,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,2021,sul,4.0,santa_catarina,sc,42.0,sao_jose,4216602.0,0.0,1,...,83,2,81,50,0,0,0,0,3,3
3201,2021,sul,4.0,santa_catarina,sc,42.0,sao_jose,4216602.0,0.0,1,...,23,3,22,1,0,0,0,0,4,4
3202,2021,sul,4.0,santa_catarina,sc,42.0,tubarao,4218707.0,0.0,1,...,0,0,0,0,0,0,0,0,4,4
3203,2021,sul,4.0,santa_catarina,sc,42.0,videira,4219309.0,0.0,1,...,19,4,19,4,0,0,0,0,3,3


In [18]:
# (rows, columns) of the merged frame.
resultado.shape

(3205, 202)

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the merged frame.
resultado.describe()

Unnamed: 0,nu_ano_censo,co_regiao,co_uf,co_municipio,in_capital,tp_dimensao,tp_organizacao_academica,tp_categoria_administrativa,tp_rede,co_ies,...,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular,qt_mob_academica,qt_ing_mob_academica,qt_mat_mob_academica,qt_conc_mob_academica,enade_faixa,cpc_faixa
count,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,...,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0
mean,2021.0,2.945086,32.699844,3283709.0,0.251794,1.0,1.613729,1.475195,1.0,2518.545398,...,21.930421,1.107956,20.444618,3.395632,0.042434,0.000936,0.04181,0.006552,3.205616,3.513885
std,0.0,1.172981,10.732144,1074156.0,0.434111,0.0,1.176364,0.83824,0.0,5038.930842,...,33.669575,3.026034,31.942566,5.760198,0.33993,0.030585,0.339088,0.091564,0.978147,0.628751
min,2021.0,1.0,11.0,1100023.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,2021.0,2.0,25.0,2504009.0,0.0,1.0,1.0,1.0,1.0,367.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
50%,2021.0,3.0,31.0,3162500.0,0.0,1.0,1.0,1.0,1.0,589.0,...,11.0,0.0,10.0,1.0,0.0,0.0,0.0,0.0,3.0,4.0
75%,2021.0,4.0,41.0,4115705.0,1.0,1.0,1.0,2.0,1.0,1810.0,...,31.0,1.0,29.0,4.0,0.0,0.0,0.0,0.0,4.0,4.0
max,2021.0,5.0,53.0,5300108.0,1.0,1.0,5.0,7.0,1.0,25352.0,...,500.0,65.0,490.0,50.0,11.0,1.0,11.0,3.0,5.0,5.0


In [20]:
# List the distinct enade_faixa values present after the merge.
resultado['enade_faixa'].unique()

array([4, 3, 5, 2, 1])

In [21]:
# Print the dtype of each column in the merged frame.
print(resultado.dtypes)

nu_ano_censo               int64
no_regiao                 object
co_regiao                float64
no_uf                     object
sg_uf                     object
                          ...   
qt_ing_mob_academica       int64
qt_mat_mob_academica       int64
qt_conc_mob_academica      int64
enade_faixa                int64
cpc_faixa                  int64
Length: 202, dtype: object


In [22]:
# Report whether any column of `resultado` contains at least one missing value.
# The first .any() reduces each column to "has a null", the second reduces across
# columns. (Using .all() here would only flag columns that are *entirely* null,
# which does not match the message printed below.)
if resultado.isnull().any().any():
    print('Há pelo menos uma coluna com dados ausentes.')
else:
    print('Não há colunas com dados ausentes.')

Não há colunas com dados ausentes.


In [23]:
# Build `df` from `resultado` without the region/state name columns, the CINE
# label/area columns and the municipality code listed below.
df = resultado.drop(
    columns=[
        'no_regiao',
        'no_uf',
        'sg_uf',
        'no_cine_rotulo',
        'co_cine_rotulo',
        'no_cine_area_especifica',
        'no_cine_area_detalhada',
        'co_cine_area_geral',
        'co_cine_area_especifica',
        'co_cine_area_detalhada',
        'co_municipio',
    ]
)

In [24]:
# (rows, columns) of df after dropping the columns above.
df.shape

(3205, 191)

#### Tratamento