In [1]:
import pandas as pd
import numpy as np
from unidecode import unidecode

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder

  from .autonotebook import tqdm as notebook_tqdm


#### Funções

In [2]:
def remove_acentos(texto):
    return unidecode(str(texto))

In [3]:
def minusculo(texto):
    texto = texto.replace(' ', '_')
    return str(texto).lower()

#### Leitura

In [4]:
df_2021 = pd.read_csv('dados/microdados_2021/dados/MICRODADOS_CADASTRO_CURSOS_2021.CSV',encoding='ISO-8859-1',sep=';')

  df_2021 = pd.read_csv('dados/microdados_2021/dados/MICRODADOS_CADASTRO_CURSOS_2021.CSV',encoding='ISO-8859-1',sep=';')


In [5]:
df_indicadores = pd.read_csv('dados_processados/indicadores.csv', sep=',')

In [6]:
#profile = ProfileReport(df_2021, title='Relatório de Dados Mínimo', minimal=True)

In [7]:
#profile.to_file('microdados.html')

#### Limpeza e ajuste de dados

In [8]:
df_2021.shape

(444786, 200)

In [9]:
df_2021 = df_2021.dropna()

In [10]:
df_2021.shape

(436533, 200)

In [11]:
df_2021['TP_MODALIDADE_ENSINO'].unique()

array([1, 2], dtype=int64)

In [12]:
df_2021 = df_2021.query('TP_MODALIDADE_ENSINO == 1').reset_index(drop=True).copy()

In [13]:
df_2021['TP_MODALIDADE_ENSINO'].unique()

array([1], dtype=int64)

In [14]:
df_2021

Unnamed: 0,NU_ANO_CENSO,NO_REGIAO,CO_REGIAO,NO_UF,SG_UF,CO_UF,NO_MUNICIPIO,CO_MUNICIPIO,IN_CAPITAL,TP_DIMENSAO,...,QT_MAT_APOIO_SOCIAL,QT_CONC_APOIO_SOCIAL,QT_ATIV_EXTRACURRICULAR,QT_ING_ATIV_EXTRACURRICULAR,QT_MAT_ATIV_EXTRACURRICULAR,QT_CONC_ATIV_EXTRACURRICULAR,QT_MOB_ACADEMICA,QT_ING_MOB_ACADEMICA,QT_MAT_MOB_ACADEMICA,QT_CONC_MOB_ACADEMICA
0,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,104,6,198,9,194,28,4,0,4,0
1,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,116,28,109,0,109,35,0,0,0,0
2,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,51,5,105,1,103,8,2,0,2,0
3,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,32,1,58,0,57,7,2,0,2,0
4,2021,Centro-Oeste,5.0,Distrito Federal,DF,53.0,Brasília,5300108.0,1.0,1,...,28,3,51,2,51,12,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35460,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,2,0,120,25,107,42,0,0,0,0
35461,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,0,0,6,0,1,1,0,0,0,0
35462,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,0,0,12,1,11,1,0,0,0,0
35463,2021,Sul,4.0,Santa Catarina,SC,42.0,Xaxim,4219705.0,0.0,1,...,0,0,8,0,6,6,0,0,0,0


#### Ajustes dataframe microdados

In [15]:
df_2021.replace({'TP_GRAU_ACADEMICO':{1:'bacharelado',2:'licenciatura',3:'tecnologico',4:'bacharelado e licenciatura'}},inplace=True)

In [16]:
df_2021.columns = df_2021.columns.str.lower()

In [17]:
colunas_categoricas = df_2021.select_dtypes(include=['object']).columns
for coluna in colunas_categoricas:
    df_2021[coluna] = df_2021[coluna].apply(remove_acentos)
    df_2021[coluna] = df_2021[coluna].apply(minusculo)

In [18]:
df_2021

Unnamed: 0,nu_ano_censo,no_regiao,co_regiao,no_uf,sg_uf,co_uf,no_municipio,co_municipio,in_capital,tp_dimensao,...,qt_mat_apoio_social,qt_conc_apoio_social,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular,qt_mob_academica,qt_ing_mob_academica,qt_mat_mob_academica,qt_conc_mob_academica
0,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,104,6,198,9,194,28,4,0,4,0
1,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,116,28,109,0,109,35,0,0,0,0
2,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,51,5,105,1,103,8,2,0,2,0
3,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,32,1,58,0,57,7,2,0,2,0
4,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,28,3,51,2,51,12,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35460,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,2,0,120,25,107,42,0,0,0,0
35461,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,0,0,6,0,1,1,0,0,0,0
35462,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,0,0,12,1,11,1,0,0,0,0
35463,2021,sul,4.0,santa_catarina,sc,42.0,xaxim,4219705.0,0.0,1,...,0,0,8,0,6,6,0,0,0,0


In [19]:
print(df_indicadores.dtypes)

tp_grau_academico    object
co_curso              int64
enade_faixa           int64
cpc_faixa             int64
dtype: object


In [20]:
df_indicadores.columns

Index(['tp_grau_academico', 'co_curso', 'enade_faixa', 'cpc_faixa'], dtype='object')

In [21]:
#Para evitar enade_faixa e cpc_faixa NaN não foi utilizado o left no merge
resultado = pd.merge(df_2021, df_indicadores[['co_curso', 'enade_faixa', 'cpc_faixa']], on='co_curso')

In [22]:
resultado

Unnamed: 0,nu_ano_censo,no_regiao,co_regiao,no_uf,sg_uf,co_uf,no_municipio,co_municipio,in_capital,tp_dimensao,...,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular,qt_mob_academica,qt_ing_mob_academica,qt_mat_mob_academica,qt_conc_mob_academica,enade_faixa,cpc_faixa
0,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,73,1,73,4,0,0,0,0,4,4
1,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,31,0,31,5,0,0,0,0,3,3
2,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,88,0,86,10,0,0,0,0,4,4
3,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,113,3,111,24,0,0,0,0,5,4
4,2021,centro-oeste,5.0,distrito_federal,df,53.0,brasilia,5300108.0,1.0,1,...,104,0,103,28,0,0,0,0,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,2021,sul,4.0,santa_catarina,sc,42.0,sao_jose,4216602.0,0.0,1,...,83,2,81,50,0,0,0,0,3,3
3201,2021,sul,4.0,santa_catarina,sc,42.0,sao_jose,4216602.0,0.0,1,...,23,3,22,1,0,0,0,0,4,4
3202,2021,sul,4.0,santa_catarina,sc,42.0,tubarao,4218707.0,0.0,1,...,0,0,0,0,0,0,0,0,4,4
3203,2021,sul,4.0,santa_catarina,sc,42.0,videira,4219309.0,0.0,1,...,19,4,19,4,0,0,0,0,3,3


In [23]:
resultado.shape

(3205, 202)

In [24]:
resultado.describe()

Unnamed: 0,nu_ano_censo,co_regiao,co_uf,co_municipio,in_capital,tp_dimensao,tp_organizacao_academica,tp_categoria_administrativa,tp_rede,co_ies,...,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular,qt_mob_academica,qt_ing_mob_academica,qt_mat_mob_academica,qt_conc_mob_academica,enade_faixa,cpc_faixa
count,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,...,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0,3205.0
mean,2021.0,2.945086,32.699844,3283709.0,0.251794,1.0,1.613729,1.475195,1.0,2518.545398,...,21.930421,1.107956,20.444618,3.395632,0.042434,0.000936,0.04181,0.006552,3.205616,3.513885
std,0.0,1.172981,10.732144,1074156.0,0.434111,0.0,1.176364,0.83824,0.0,5038.930842,...,33.669575,3.026034,31.942566,5.760198,0.33993,0.030585,0.339088,0.091564,0.978147,0.628751
min,2021.0,1.0,11.0,1100023.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,2021.0,2.0,25.0,2504009.0,0.0,1.0,1.0,1.0,1.0,367.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
50%,2021.0,3.0,31.0,3162500.0,0.0,1.0,1.0,1.0,1.0,589.0,...,11.0,0.0,10.0,1.0,0.0,0.0,0.0,0.0,3.0,4.0
75%,2021.0,4.0,41.0,4115705.0,1.0,1.0,1.0,2.0,1.0,1810.0,...,31.0,1.0,29.0,4.0,0.0,0.0,0.0,0.0,4.0,4.0
max,2021.0,5.0,53.0,5300108.0,1.0,1.0,5.0,7.0,1.0,25352.0,...,500.0,65.0,490.0,50.0,11.0,1.0,11.0,3.0,5.0,5.0


In [25]:
resultado['enade_faixa'].unique()

array([4, 3, 5, 2, 1], dtype=int64)

In [26]:
print(resultado.dtypes)

nu_ano_censo               int64
no_regiao                 object
co_regiao                float64
no_uf                     object
sg_uf                     object
                          ...   
qt_ing_mob_academica       int64
qt_mat_mob_academica       int64
qt_conc_mob_academica      int64
enade_faixa                int64
cpc_faixa                  int64
Length: 202, dtype: object


In [27]:
if resultado.isnull().all().any():
    print('Há pelo menos uma coluna com dados ausentes.')
else:
    print('Não há colunas com dados ausentes.')

Não há colunas com dados ausentes.


In [28]:
df = resultado.drop(['no_regiao', 'no_uf', 'sg_uf', 'no_cine_rotulo', 'co_cine_rotulo', 'no_cine_area_especifica', 'no_cine_area_detalhada',
                     'co_cine_area_geral', 'co_cine_area_especifica', 'co_cine_area_detalhada', 'co_municipio'],axis=1)

In [29]:
df.shape

(3205, 191)

#### Tratamento

In [30]:
#Primeiro passo é identificar os outliers e substituí-los

In [31]:
def detect_outliers_iqr(data, threshold=1.5):
    """
    Detecta outliers em uma série de dados usando o método do intervalo interquartil (IQR).
    
    Parâmetros:
        data (pd.Series): A série de dados para verificar outliers.
        threshold (float): O fator multiplicativo do IQR usado para definir o limite para outliers.
        
    Retorna:
        outliers (int): O número de amostras com outliers.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr
    
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return len(outliers)

In [32]:
# Calcula a quantidade de amostras com outliers para cada coluna numérica
outliers_count = {}
for column in df.select_dtypes(include=['number']):
    outliers_count[column] = detect_outliers_iqr(df[column])

# Exibe a quantidade de amostras com outliers por coluna
for column, count in outliers_count.items():
    print(f"Coluna '{column}': {count} amostras com outliers")

Coluna 'nu_ano_censo': 0 amostras com outliers
Coluna 'co_regiao': 0 amostras com outliers
Coluna 'co_uf': 0 amostras com outliers
Coluna 'in_capital': 0 amostras com outliers
Coluna 'tp_dimensao': 0 amostras com outliers
Coluna 'tp_organizacao_academica': 708 amostras com outliers
Coluna 'tp_categoria_administrativa': 44 amostras com outliers
Coluna 'tp_rede': 0 amostras com outliers
Coluna 'co_ies': 490 amostras com outliers
Coluna 'co_curso': 47 amostras com outliers
Coluna 'in_gratuito': 98 amostras com outliers
Coluna 'tp_modalidade_ensino': 0 amostras com outliers
Coluna 'tp_nivel_academico': 0 amostras com outliers
Coluna 'qt_curso': 0 amostras com outliers
Coluna 'qt_vg_total': 188 amostras com outliers
Coluna 'qt_vg_total_diurno': 100 amostras com outliers
Coluna 'qt_vg_total_noturno': 77 amostras com outliers
Coluna 'qt_vg_total_ead': 0 amostras com outliers
Coluna 'qt_vg_nova': 140 amostras com outliers
Coluna 'qt_vg_proc_seletivo': 0 amostras com outliers
Coluna 'qt_vg_rema

In [33]:
def replace_outliers_iqr(data, threshold=1.5):
    """
    Substitui outliers em uma série de dados pelos limites inferior ou superior do IQR.
    
    Parâmetros:
        data (pd.Series): A série de dados para substituir outliers.
        threshold (float): O fator multiplicativo do IQR usado para definir o limite para outliers.
        
    Retorna:
        data_with_replaced_outliers (pd.Series): A série de dados com outliers substituídos.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr
    
    data_with_replaced_outliers = data.copy()
    data_with_replaced_outliers[data_with_replaced_outliers < lower_bound] = lower_bound
    data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
    
    return data_with_replaced_outliers

In [34]:
# Substitui os outliers por limites do IQR para cada coluna numérica
for column in df.select_dtypes(include=['number']):
    df[column] = replace_outliers_iqr(df[column])

  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_replaced_outliers > upper_bound] = upper_bound
  data_with_replaced_outliers[data_with_rep

In [35]:
# Calcula a quantidade de amostras com outliers para cada coluna numérica
outliers_count = {}
for column in df.select_dtypes(include=['number']):
    outliers_count[column] = detect_outliers_iqr(df[column])

# Exibe a quantidade de amostras com outliers por coluna
for column, count in outliers_count.items():
    print(f"Coluna '{column}': {count} amostras com outliers")

Coluna 'nu_ano_censo': 0 amostras com outliers
Coluna 'co_regiao': 0 amostras com outliers
Coluna 'co_uf': 0 amostras com outliers
Coluna 'in_capital': 0 amostras com outliers
Coluna 'tp_dimensao': 0 amostras com outliers
Coluna 'tp_organizacao_academica': 0 amostras com outliers
Coluna 'tp_categoria_administrativa': 0 amostras com outliers
Coluna 'tp_rede': 0 amostras com outliers
Coluna 'co_ies': 0 amostras com outliers
Coluna 'co_curso': 0 amostras com outliers
Coluna 'in_gratuito': 0 amostras com outliers
Coluna 'tp_modalidade_ensino': 0 amostras com outliers
Coluna 'tp_nivel_academico': 0 amostras com outliers
Coluna 'qt_curso': 0 amostras com outliers
Coluna 'qt_vg_total': 0 amostras com outliers
Coluna 'qt_vg_total_diurno': 0 amostras com outliers
Coluna 'qt_vg_total_noturno': 0 amostras com outliers
Coluna 'qt_vg_total_ead': 0 amostras com outliers
Coluna 'qt_vg_nova': 0 amostras com outliers
Coluna 'qt_vg_proc_seletivo': 0 amostras com outliers
Coluna 'qt_vg_remanesc': 0 amost

In [36]:
df

Unnamed: 0,nu_ano_censo,co_regiao,co_uf,no_municipio,in_capital,tp_dimensao,tp_organizacao_academica,tp_categoria_administrativa,tp_rede,co_ies,...,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular,qt_mob_academica,qt_ing_mob_academica,qt_mat_mob_academica,qt_conc_mob_academica,enade_faixa,cpc_faixa
0,2021,5.0,53.0,brasilia,1.0,1,1,1.0,1,2.0,...,73,1.0,72.5,4,0,0,0,0,4.0,4.0
1,2021,5.0,53.0,brasilia,1.0,1,1,1.0,1,2.0,...,31,0.0,31.0,5,0,0,0,0,3.0,3.0
2,2021,5.0,53.0,brasilia,1.0,1,1,1.0,1,2.0,...,76,0.0,72.5,10,0,0,0,0,4.0,4.0
3,2021,5.0,53.0,brasilia,1.0,1,1,1.0,1,2.0,...,76,2.5,72.5,10,0,0,0,0,5.0,4.0
4,2021,5.0,53.0,brasilia,1.0,1,1,1.0,1,2.0,...,76,0.0,72.5,10,0,0,0,0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,2021,4.0,42.0,sao_jose,0.0,1,1,3.0,1,3974.5,...,76,2.0,72.5,10,0,0,0,0,3.0,3.0
3201,2021,4.0,42.0,sao_jose,0.0,1,1,1.0,1,3162.0,...,23,2.5,22.0,1,0,0,0,0,4.0,4.0
3202,2021,4.0,42.0,tubarao,0.0,1,1,1.0,1,3162.0,...,0,0.0,0.0,0,0,0,0,0,4.0,4.0
3203,2021,4.0,42.0,videira,0.0,1,1,1.0,1,3974.5,...,19,2.5,19.0,4,0,0,0,0,3.0,3.0


In [37]:
df.shape

(3205, 191)

In [38]:
df1 = df.drop(['no_curso','no_municipio','no_cine_area_geral','tp_grau_academico','enade_faixa', 'cpc_faixa'],axis=1)

In [39]:
# Remover todas as colunas com variância zero

In [40]:
sel = VarianceThreshold(threshold=(0))
df_selected = sel.fit_transform(df1)
cols_selected = df1.columns[sel.get_support(indices=True)]
df_selected = pd.DataFrame(df_selected, columns=cols_selected)

In [41]:
df_selected

Unnamed: 0,co_regiao,co_uf,in_capital,tp_categoria_administrativa,co_ies,co_curso,qt_vg_total,qt_vg_total_diurno,qt_vg_total_noturno,qt_vg_nova,...,qt_conc_procescpublica,qt_conc_procescprivada,qt_apoio_social,qt_ing_apoio_social,qt_mat_apoio_social,qt_conc_apoio_social,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular
0,5.0,53.0,1.0,1.0,2.0,151.0,20.0,20.0,0.0,0.0,...,8.0,6.0,20.0,1.0,20.0,1.0,73.0,1.0,72.5,4.0
1,5.0,53.0,1.0,1.0,2.0,52159.0,10.0,0.0,10.0,10.0,...,3.0,5.0,11.0,0.0,11.0,1.0,31.0,0.0,31.0,5.0
2,5.0,53.0,1.0,1.0,2.0,127.0,84.0,84.0,0.0,80.0,...,12.0,15.0,31.0,3.0,31.0,0.0,76.0,0.0,72.5,10.0
3,5.0,53.0,1.0,1.0,2.0,160.0,92.0,0.0,92.0,80.0,...,22.0,15.0,67.5,4.0,62.5,7.5,76.0,2.5,72.5,10.0
4,5.0,53.0,1.0,1.0,2.0,44382.0,84.0,84.0,0.0,80.0,...,18.0,15.0,59.0,5.0,58.0,5.0,76.0,0.0,72.5,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,4.0,42.0,0.0,3.0,3974.5,96874.0,80.0,0.0,80.0,80.0,...,31.5,15.0,0.0,0.0,0.0,0.0,76.0,2.0,72.5,10.0
3201,4.0,42.0,0.0,1.0,3162.0,1102978.0,56.0,0.0,56.0,36.0,...,6.0,0.0,40.0,5.0,31.0,1.0,23.0,2.5,22.0,1.0
3202,4.0,42.0,0.0,1.0,3162.0,1363832.0,41.0,0.0,41.0,40.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3203,4.0,42.0,0.0,1.0,3974.5,1160123.0,89.0,89.0,0.0,40.0,...,19.0,6.0,8.0,5.0,8.0,0.0,19.0,2.5,19.0,4.0


In [42]:
df_selected.shape

(3205, 103)

In [43]:
# Padronizando os dados com StandardScaler

In [44]:
scaler = StandardScaler()
standard_data = scaler.fit_transform(df_selected.drop('co_curso', axis=1))

In [45]:
df_processed = pd.DataFrame(standard_data, columns=df_selected.drop('co_curso', axis=1).columns)

In [46]:
df_processed

Unnamed: 0,co_regiao,co_uf,in_capital,tp_categoria_administrativa,co_ies,qt_vg_total,qt_vg_total_diurno,qt_vg_total_noturno,qt_vg_nova,qt_vg_remanesc,...,qt_conc_procescpublica,qt_conc_procescprivada,qt_apoio_social,qt_ing_apoio_social,qt_mat_apoio_social,qt_conc_apoio_social,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_mat_ativ_extracurricular,qt_conc_ativ_extracurricular
0,1.752146,1.891824,1.723804,-0.735409,-0.901403,-1.005037,-0.212998,-0.873243,-1.543211,0.386979,...,-0.298582,0.430374,0.107162,-0.135733,0.193426,-0.293097,2.423857,0.464852,2.598061,0.386481
1,1.752146,1.891824,1.723804,-0.735409,-0.901403,-1.256279,-0.789470,-0.613992,-1.192717,-0.780552,...,-0.857126,0.218752,-0.314139,-0.653617,-0.260643,-0.293097,0.529363,-0.609302,0.622074,0.677181
2,1.752146,1.891824,1.723804,-0.735409,-0.901403,0.602910,1.631712,-0.873243,1.260738,-0.547046,...,0.148254,2.334966,0.622085,0.900036,0.748400,-0.688540,2.559178,-0.609302,2.598061,2.130682
3,1.752146,1.891824,1.723804,-0.735409,-0.901403,0.803903,-0.789470,1.511865,1.260738,-0.080033,...,1.265342,2.334966,2.330693,1.417921,2.337642,2.277284,2.559178,2.076084,2.598061,2.130682
4,1.752146,1.891824,1.723804,-0.735409,-0.901403,0.602910,1.631712,-0.873243,1.260738,-0.547046,...,0.818507,2.334966,1.932798,1.935805,2.110608,1.288676,2.559178,-0.609302,2.598061,2.130682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,0.899485,0.866705,-0.580112,2.707958,1.949733,0.502413,-0.789470,1.200764,1.260738,-0.780552,...,2.326576,2.334966,-0.829062,-0.653617,-0.815617,-0.688540,2.559178,1.539007,2.598061,2.130682
3201,0.899485,0.866705,-0.580112,-0.735409,1.366587,-0.100567,-0.789470,0.578562,-0.281434,0.386979,...,-0.521999,-0.839354,1.043385,1.935805,0.748400,-0.293097,0.168507,2.076084,0.193547,-0.485619
3202,0.899485,0.866705,-0.580112,-0.735409,1.366587,-0.477430,-0.789470,0.189686,-0.141236,-0.722175,...,-0.745417,-0.839354,-0.829062,-0.653617,-0.815617,-0.688540,-0.868953,-0.609302,-0.853965,-0.776319
3203,0.899485,0.866705,-0.580112,-0.735409,1.949733,0.728530,1.775830,-0.873243,-0.141236,2.079899,...,0.930216,0.430374,-0.454572,1.935805,-0.412000,-0.688540,-0.011921,2.076084,0.050704,0.386481


In [47]:
df_processed.shape

(3205, 102)

In [48]:
df_processed['co_curso'] = df['co_curso']
df_processed['no_curso'] = df['no_municipio'] + '_' + df['tp_grau_academico'] + '_' + df['no_curso']
df_processed['no_cine_area_geral'] = df['no_cine_area_geral']
df_processed['tp_grau_academico'] = df['tp_grau_academico']
df_processed['enade_faixa'] = df['enade_faixa']
df_processed['cpc_faixa'] = df['cpc_faixa']

In [49]:
# Calculando a correlação entre colunas e excluindo as que tem correlação maior que o limite determinado (0.8)

In [50]:
limit = 0.8
corr = df_processed.corr(numeric_only=True)
mask = np.triu(
    np.ones(corr.shape), k=1
).astype(bool)
corr_no_dial = corr.where(mask)
coll = [
    c
    for c in corr_no_dial.columns
    if any(abs(corr_no_dial[c])>limit)
]

In [51]:
coll

['co_uf',
 'qt_vg_nova',
 'qt_insc_vg_nova',
 'qt_ing_fem',
 'qt_ing_masc',
 'qt_ing_diurno',
 'qt_ing_vg_nova',
 'qt_ing_18_24',
 'qt_mat_diurno',
 'qt_mat_noturno',
 'qt_mat_18_24',
 'qt_mat_25_29',
 'qt_mat_30_34',
 'qt_mat_35_39',
 'qt_mat_40_49',
 'qt_conc_fem',
 'qt_conc_18_24',
 'qt_conc_25_29',
 'qt_ing_nacbras',
 'qt_mat_nacbras',
 'qt_conc_nacbras',
 'qt_mat_deficiente',
 'qt_ing_rvredepublica',
 'qt_ing_rvetnico',
 'qt_ing_rvsocial_rf',
 'qt_mat_reserva_vaga',
 'qt_mat_rvredepublica',
 'qt_mat_rvetnico',
 'qt_mat_rvsocial_rf',
 'qt_conc_rvredepublica',
 'qt_conc_rvetnico',
 'qt_ing_procescpublica',
 'qt_mat_procescpublica',
 'qt_mat_procescprivada',
 'qt_conc_procescpublica',
 'qt_mat_apoio_social',
 'qt_mat_ativ_extracurricular']

In [52]:
df_processed = df_processed.drop(coll, axis=1)

In [53]:
df_processed['co_curso'] = df['co_curso']

In [54]:
df_processed.shape

(3205, 71)

In [55]:
df_processed

Unnamed: 0,co_regiao,in_capital,tp_categoria_administrativa,co_ies,qt_vg_total,qt_vg_total_diurno,qt_vg_total_noturno,qt_vg_remanesc,qt_inscrito_total,qt_inscrito_total_diurno,...,qt_conc_apoio_social,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_conc_ativ_extracurricular,co_curso,no_curso,no_cine_area_geral,tp_grau_academico,enade_faixa,cpc_faixa
0,1.752146,1.723804,-0.735409,-0.901403,-1.005037,-0.212998,-0.873243,0.386979,-0.895389,-0.579250,...,-0.293097,2.423857,0.464852,0.386481,151,brasilia_licenciatura_artes_visuais,educacao,licenciatura,4.0,4.0
1,1.752146,1.723804,-0.735409,-0.901403,-1.256279,-0.789470,-0.613992,-0.780552,-0.949700,-0.683617,...,-0.293097,0.529363,-0.609302,0.677181,52159,brasilia_licenciatura_artes_visuais,educacao,licenciatura,3.0,3.0
2,1.752146,1.723804,-0.735409,-0.901403,0.602910,1.631712,-0.873243,-0.547046,2.279371,2.205103,...,-0.688540,2.559178,-0.609302,2.130682,127,brasilia_bacharelado_ciencia_da_computacao,computacao_e_tecnologias_da_informacao_e_comun...,bacharelado,4.0,4.0
3,1.752146,1.723804,-0.735409,-0.901403,0.803903,-0.789470,1.511865,-0.080033,0.200718,-0.683617,...,2.277284,2.559178,2.076084,2.130682,160,brasilia_licenciatura_ciencias_biologicas,educacao,licenciatura,5.0,4.0
4,1.752146,1.723804,-0.735409,-0.901403,0.602910,1.631712,-0.873243,-0.547046,2.279371,2.205103,...,1.288676,2.559178,-0.609302,2.130682,44382,brasilia_bacharelado_ciencias_biologicas,"ciencias_naturais,_matematica_e_estatistica",bacharelado,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,0.899485,-0.580112,2.707958,1.949733,0.502413,-0.789470,1.200764,-0.780552,-0.307836,-0.683617,...,-0.688540,2.559178,1.539007,2.130682,96874,sao_jose_licenciatura_pedagogia,educacao,licenciatura,3.0,3.0
3201,0.899485,-0.580112,-0.735409,1.366587,-0.100567,-0.789470,0.578562,0.386979,-0.110339,-0.683617,...,-0.293097,0.168507,2.076084,-0.485619,1102978,sao_jose_licenciatura_quimica,educacao,licenciatura,4.0,4.0
3202,0.899485,-0.580112,-0.735409,1.366587,-0.477430,-0.789470,0.189686,-0.722175,0.299466,-0.683617,...,-0.688540,-0.868953,-0.609302,-0.776319,1363832,tubarao_tecnologico_analise_e_desenvolvimento_...,computacao_e_tecnologias_da_informacao_e_comun...,tecnologico,4.0,4.0
3203,0.899485,-0.580112,-0.735409,1.949733,0.728530,1.775830,-0.873243,2.079899,-0.757141,-0.370517,...,-0.688540,-0.011921,2.076084,0.386481,1160123,videira_bacharelado_ciencia_da_computacao,computacao_e_tecnologias_da_informacao_e_comun...,bacharelado,3.0,3.0


In [56]:
# Codificando as colunas categóricas usando o OneHotEncoder

In [57]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [58]:
categoricas = ['no_cine_area_geral','tp_grau_academico', 'enade_faixa', 'cpc_faixa']

In [59]:
matriz_enc = enc.fit_transform(df_processed[categoricas])

In [60]:
df_enc = pd.DataFrame(matriz_enc, columns=enc.get_feature_names_out())

In [61]:
X = pd.concat([df_enc.reset_index(drop=True),df_processed.drop(columns=categoricas).reset_index(drop=True)],axis=1)

In [62]:
X

Unnamed: 0,no_cine_area_geral_artes_e_humanidades,"no_cine_area_geral_ciencias_naturais,_matematica_e_estatistica","no_cine_area_geral_ciencias_sociais,_comunicacao_e_informacao",no_cine_area_geral_computacao_e_tecnologias_da_informacao_e_comunicacao_(tic),no_cine_area_geral_educacao,no_cine_area_geral_saude_e_bem-estar,tp_grau_academico_bacharelado,tp_grau_academico_licenciatura,tp_grau_academico_tecnologico,enade_faixa_1.5,...,qt_ing_procescprivada,qt_conc_procescprivada,qt_apoio_social,qt_ing_apoio_social,qt_conc_apoio_social,qt_ativ_extracurricular,qt_ing_ativ_extracurricular,qt_conc_ativ_extracurricular,co_curso,no_curso
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-0.141161,0.430374,0.107162,-0.135733,-0.293097,2.423857,0.464852,0.386481,151,brasilia_licenciatura_artes_visuais
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-0.768070,0.218752,-0.314139,-0.653617,-0.293097,0.529363,-0.609302,0.677181,52159,brasilia_licenciatura_artes_visuais
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.948538,2.334966,0.622085,0.900036,-0.688540,2.559178,-0.609302,2.130682,127,brasilia_bacharelado_ciencia_da_computacao
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.635083,2.334966,2.330693,1.417921,2.277284,2.559178,2.076084,2.130682,160,brasilia_licenciatura_ciencias_biologicas
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.366478,2.334966,1.932798,1.935805,1.288676,2.559178,-0.609302,2.130682,44382,brasilia_bacharelado_ciencias_biologicas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.530598,2.334966,-0.829062,-0.653617,-0.688540,2.559178,1.539007,2.130682,96874,sao_jose_licenciatura_pedagogia
3201,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.694719,-0.839354,1.043385,1.935805,-0.293097,0.168507,2.076084,-0.485619,1102978,sao_jose_licenciatura_quimica
3202,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.245646,-0.839354,-0.829062,-0.653617,-0.688540,-0.868953,-0.609302,-0.776319,1363832,tubarao_tecnologico_analise_e_desenvolvimento_...
3203,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.559100,0.430374,-0.454572,1.935805,-0.688540,-0.011921,2.076084,0.386481,1160123,videira_bacharelado_ciencia_da_computacao


In [63]:
X.shape

(3205, 86)

In [64]:
X = X.drop('in_capital', axis=1)

In [65]:
X.shape

(3205, 85)

In [66]:
#X.columns

In [67]:
###'qt_vg_total_diurno','qt_vg_total_noturno','qt_inscrito_total_diurno', 'qt_ing', 'qt_ing_25_29', 'qt_ing_30_34', 'qt_ing_40_49', 'qt_ing_50_59', 'qt_mat_60_mais', 'qt_mat_cornd', 'qt_conc_diurno', 'qt_conc_30_34', 'qt_conc_preta', 'qt_mat_amarela', 'qt_mat_indigena'

In [68]:
# Dropando colunas que não serão utilizadas
X = X.drop(['qt_vg_total_diurno','qt_vg_total_noturno','qt_inscrito_total_diurno', 'qt_ing', 'qt_ing_25_29', 'qt_ing_30_34', 'qt_ing_40_49', 'qt_ing_50_59', 'qt_mat_60_mais', 'qt_mat_cornd', 'qt_conc_diurno', 'qt_conc_30_34', 'qt_conc_preta', 'qt_mat_amarela', 'qt_mat_indigena', 'co_regiao',
 'tp_categoria_administrativa',
 'co_ies',
 'qt_inscrito_total_noturno',
 'qt_ing_noturno',
 'qt_ing_vestibular',
 'qt_ing_enem',
 'qt_ing_vg_remanesc',
 'qt_ing_35_39',
 'qt_ing_branca',
 'qt_ing_parda',
 'qt_mat',
 'qt_mat_fem',
 'qt_mat_masc',
 'qt_mat_50_59',
 'qt_mat_branca',
 'qt_mat_preta',
 'qt_mat_parda',
 'qt_conc_masc',
 'qt_conc_noturno',
 'qt_conc_35_39',
 'qt_conc_40_49',
 'qt_conc_branca',
 'qt_conc_parda',
 'qt_conc_cornd',
 'qt_aluno_deficiente',
 'qt_mat_rvpdef',
 'qt_conc_reserva_vaga',
 'qt_conc_rvsocial_rf',
 'qt_conc_procescprivada',
 'qt_ing_apoio_social',
 'qt_ing_ativ_extracurricular',
 'qt_conc_ativ_extracurricular' ],axis=1)

In [69]:
X.columns

Index(['no_cine_area_geral_artes_e_humanidades',
       'no_cine_area_geral_ciencias_naturais,_matematica_e_estatistica',
       'no_cine_area_geral_ciencias_sociais,_comunicacao_e_informacao',
       'no_cine_area_geral_computacao_e_tecnologias_da_informacao_e_comunicacao_(tic)',
       'no_cine_area_geral_educacao', 'no_cine_area_geral_saude_e_bem-estar',
       'tp_grau_academico_bacharelado', 'tp_grau_academico_licenciatura',
       'tp_grau_academico_tecnologico', 'enade_faixa_1.5', 'enade_faixa_2.0',
       'enade_faixa_3.0', 'enade_faixa_4.0', 'enade_faixa_5.0',
       'cpc_faixa_1.5', 'cpc_faixa_2.0', 'cpc_faixa_3.0', 'cpc_faixa_4.0',
       'cpc_faixa_5.0', 'qt_vg_total', 'qt_vg_remanesc', 'qt_inscrito_total',
       'qt_insc_vg_remanesc', 'qt_ing_preta', 'qt_ing_amarela', 'qt_ing_cornd',
       'qt_conc', 'qt_ing_deficiente', 'qt_ing_reserva_vaga',
       'qt_sit_trancada', 'qt_sit_desvinculado', 'qt_ing_procescprivada',
       'qt_apoio_social', 'qt_conc_apoio_social', 'qt_a

In [70]:
X.shape

(3205, 37)

In [71]:
#Salvando os dados
X.to_csv('dados_processados/cursos-ufrpe-pos-processamento.csv',sep=';',index=False)