# Análise Global de Diversidade

In [173]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings("ignore")

In [120]:
# Census years covered by the analysis; the loops below filter NU_ANO_CENSO by each value.
ANOS = [2015, 2016, 2017, 2018, 2019]
# Institution acronyms; the loops below filter SG_IES by each value.
UNIVERSIDADES = ['IFBA', 'UFBA', 'UEFS', 'UNEB', 'IFBAIANO', 'UESB', 'UNIVASF', 'UESC']

In [121]:
# Load the dataset analysed by this notebook. The path is relative, so the
# notebook must be run from the directory that contains `dataset/`.
data_frame_diversidade = pd.read_csv('dataset/alunos_curso_2015_2019_tic_BA.csv')
# Preview the first rows to confirm the file was parsed.
data_frame_diversidade.head()

Unnamed: 0,NU_ANO_CENSO,CO_IES,TP_CATEGORIA_ADMINISTRATIVA,TP_ORGANIZACAO_ACADEMICA,CO_CURSO,CO_CURSO_POLO,TP_TURNO,TP_GRAU_ACADEMICO,TP_MODALIDADE_ENSINO,TP_NIVEL_ACADEMICO,...,CO_PAIS_DESTINO,IN_MATRICULA,IN_CONCLUINTE,IN_INGRESSO_TOTAL,IN_INGRESSO_VAGA_NOVA,IN_INGRESSO_PROCESSO_SELETIVO,NU_ANO_INGRESSO,CO_UF,NO_CURSO,SG_IES
0,2015,578,1.0,1.0,13277.0,,4.0,1.0,1.0,1.0,...,,1.0,0.0,0.0,0.0,,2013.0,29,CIÊNCIA DA COMPUTAÇÃO,UFBA
1,2015,578,1.0,1.0,13277.0,,4.0,1.0,1.0,1.0,...,,1.0,0.0,0.0,0.0,,2013.0,29,CIÊNCIA DA COMPUTAÇÃO,UFBA
2,2015,578,1.0,1.0,13277.0,,4.0,1.0,1.0,1.0,...,,1.0,0.0,0.0,0.0,,2013.0,29,CIÊNCIA DA COMPUTAÇÃO,UFBA
3,2015,578,1.0,1.0,13277.0,,4.0,1.0,1.0,1.0,...,,1.0,0.0,0.0,0.0,,2013.0,29,CIÊNCIA DA COMPUTAÇÃO,UFBA
4,2015,578,1.0,1.0,13277.0,,4.0,1.0,1.0,1.0,...,,1.0,0.0,0.0,0.0,,2013.0,29,CIÊNCIA DA COMPUTAÇÃO,UFBA


In [122]:
# Nested mapping in the form expected by DataFrame.replace:
# {column name: {numeric code: readable label}} for the TP_COR_RACA column.
replace_map_cor_raca = {
    'TP_COR_RACA':{
        0: 'Não quis declarar',  
        1: 'Branca',
        2: 'Preta',
        3: 'Parda',
        4: 'Amarela',
        5: 'Indígena',
        9: 'Sem informação'
    }
}

In [123]:
# Nested mapping in the form expected by DataFrame.replace:
# {column name: {numeric code: readable label}} for the TP_SEXO column.
replace_map_sexo = {
    'TP_SEXO':{
        1: 'Feminino',
        2: 'Masculino'
    }
}

In [186]:
# Nested mapping in the form expected by DataFrame.replace:
# {column name: {numeric code: readable label}} for the IN_DEFICIENCIA column.
# NOTE(review): the label given to code 9 ('Sem Deficiência') reads as
# "no disability", yet the deficiency section later drops every row carrying
# that label before measuring diversity, as if it meant "no information".
# Confirm the meaning of codes 0 and 9 against the census data dictionary.
# The label string is also used as a filter value further down, so any rename
# has to be applied in both places at once.
replace_map_deficiencia = {
    'IN_DEFICIENCIA':{
        0.0: 'Normal',  
        1.0: 'Deficiente',
        9.0: 'Sem Deficiência'
    }
}

In [184]:
# Recode TP_COR_RACA from numeric codes to labels. The result is rebound to the
# same name instead of using inplace=True, so the cell produces a new frame
# rather than mutating the one earlier cells displayed.
data_frame_diversidade = data_frame_diversidade.replace(replace_map_cor_raca)

In [185]:
# Recode TP_SEXO from numeric codes to labels. The result is rebound to the
# same name instead of using inplace=True, so the cell produces a new frame
# rather than mutating the one earlier cells displayed.
data_frame_diversidade = data_frame_diversidade.replace(replace_map_sexo)

In [187]:
# Recode IN_DEFICIENCIA from numeric codes to labels. The result is rebound to
# the same name instead of using inplace=True, so the cell produces a new frame
# rather than mutating the one earlier cells displayed.
data_frame_diversidade = data_frame_diversidade.replace(replace_map_deficiencia)

In [222]:
def calculo_desvio_padrao(coluna, dataframe):
    """Return the standard deviation of one column, rounded to 4 decimals.

    Parameters
    ----------
    coluna : str
        Name of the column to summarise.
    dataframe : pandas.DataFrame
        Frame that contains `coluna`.

    Returns
    -------
    float
        Result of `Series.std()` for the column, rounded to four decimal
        places. It is NaN when pandas cannot compute it (for example when
        fewer than two values are available).
    """
    desvio = dataframe[coluna].std()
    return round(desvio, 4)

In [127]:
def min_max_scaler(dataframe, column_name):
    """Rescale a column linearly to the [0, 1] interval.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains `column_name`.
    column_name : str
        Name of the numeric column to rescale.

    Returns
    -------
    pandas.Series
        `(value - min) / (max - min)` for every row, keeping the original
        index. When every value is the same (max == min) the result is 0.0 for
        each non-missing entry instead of the NaN that 0/0 would give.
    """
    valores = dataframe[column_name]
    minimo = valores.min()
    amplitude = valores.max() - minimo
    if amplitude == 0:
        # Constant column: avoid 0/0. Subtracting the minimum yields zeros and
        # leaves missing entries as NaN; cast so the dtype matches the normal path.
        return (valores - minimo).astype(float)
    return (valores - minimo) / amplitude
        

In [147]:
def calcula_diversidade_global(dataframe_execucao, coluna_principal, coluna_quantidade, is_reserva = False):
    """Compute a per-institution, per-year dispersion score for one dimension.

    For each year in ANOS the category counts of all institutions are min-max
    scaled together. The standard deviation of the scaled counts is then taken
    separately for each institution in UNIVERSIDADES.

    Parameters
    ----------
    dataframe_execucao : pandas.DataFrame
        Must contain 'SG_IES' and 'NU_ANO_CENSO'. With `is_reserva=False` it
        must also contain `coluna_principal` (rows are counted per category).
        With `is_reserva=True` it must already contain `coluna_quantidade`.
    coluna_principal : str
        Categorical column whose values are counted (ignored when
        `is_reserva` is True).
    coluna_quantidade : str
        Name of the column holding the counts.
    is_reserva : bool, default False
        When True the input is treated as already aggregated and is used as is.

    Returns
    -------
    dict
        Maps each institution in UNIVERSIDADES to a list with one value per
        year, in ANOS order. A value is NaN when pandas cannot compute the
        standard deviation for that institution and year.
    """
    # Build the accumulator from UNIVERSIDADES so it always has the keys the
    # inner loop appends to.
    std_geral = {universidade: [] for universidade in UNIVERSIDADES}

    if is_reserva:
        contagens = dataframe_execucao
    else:
        contagens = (
            dataframe_execucao
            .groupby(['SG_IES', 'NU_ANO_CENSO'])[coluna_principal]
            .value_counts()
            .to_frame(name=coluna_quantidade)
            .reset_index()
        )

    for ano in ANOS:
        # Explicit copy: the scaled column is written to an independent frame,
        # never to a slice of `contagens` (and therefore never to the caller's
        # frame when is_reserva is True).
        contagens_ano = contagens[contagens['NU_ANO_CENSO'] == ano].copy()
        contagens_ano[coluna_quantidade] = min_max_scaler(contagens_ano, coluna_quantidade)
        for universidade in UNIVERSIDADES:
            contagens_ies = contagens_ano[contagens_ano['SG_IES'] == universidade]
            std_geral[universidade].append(
                calculo_desvio_padrao(coluna_quantidade, contagens_ies)
            )
    return std_geral

In [138]:
def construcao_data_frame_reserva(df, colunas, anos=None):
    """Count, per institution and year, the rows flagged in each indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SG_IES', 'NU_ANO_CENSO' and every column in `colunas`.
    colunas : sequence of str
        Indicator columns; a row is counted when its value equals 1.
    anos : iterable of int, optional
        Years to report. Defaults to 2015 through 2019.

    Returns
    -------
    pandas.DataFrame
        One row per (institution, year, indicator column) with the columns
        'SG_IES', 'NU_ANO_CENSO', 'RESERVA' (the indicator column name) and
        'RESERVA_QUANTIDADE' (number of rows equal to 1). Combinations with no
        matching rows are kept with a count of 0.
    """
    if anos is None:
        anos = range(2015, 2020)

    registros = []
    for uni in df['SG_IES'].unique():
        df_ies = df.loc[df['SG_IES'] == uni]
        for ano in anos:
            df_uni = df_ies.loc[df_ies['NU_ANO_CENSO'] == ano]
            for coluna in colunas:
                # Counting the matching rows directly is equivalent to
                # grouping the filtered rows and summing the group sizes.
                quantidade = int((df_uni[coluna] == 1).sum())
                registros.append((uni, ano, coluna, quantidade))

    return pd.DataFrame(
        registros,
        columns=['SG_IES', 'NU_ANO_CENSO', 'RESERVA', 'RESERVA_QUANTIDADE'],
    )


## Análise da dimensão Gênero

In [211]:
print('\n\nDiversidade de Gênero')
std_genero = calcula_diversidade_global(data_frame_diversidade, 'TP_SEXO', 'TP_SEXO_QUANTIDADE')
# Index the rows by ANOS, the same list calcula_diversidade_global iterates,
# so the year labels cannot drift from the order of the computed values.
display(pd.DataFrame(std_genero, index=ANOS))



Diversidade de Gênero


Unnamed: 0,IFBA,UFBA,UEFS,UNEB,IFBAIANO,UESB,UNIVASF,UESC
2015,0.3588,0.592,0.276,0.1094,0.1009,0.1475,0.1675,0.1485
2016,0.3639,0.5852,0.2404,0.1243,0.1036,0.1401,0.1517,0.121
2017,0.4328,0.5922,0.25,0.1448,0.0947,0.1311,0.1311,0.1238
2018,0.4521,0.5989,0.2372,0.1371,0.0934,0.1282,0.1342,0.1127
2019,0.495,0.6094,0.2121,0.107,0.109,0.1204,0.1298,0.0916


## Análise da dimensão Cor e Raça


In [98]:
# Frequency of each TP_COR_RACA label across the whole dataset.
data_frame_diversidade['TP_COR_RACA'].value_counts()

Não quis declarar    8399
Parda                5518
Preta                2654
Branca               1615
Amarela               107
Indígena               66
Sem informação         55
Name: TP_COR_RACA, dtype: int64

In [133]:
# Keep only rows whose race/colour was actually declared, then show the
# remaining label frequencies.
dataframe_temp = data_frame_diversidade[
    ~data_frame_diversidade['TP_COR_RACA'].isin(['Não quis declarar', 'Sem informação'])
]
dataframe_temp['TP_COR_RACA'].value_counts()

Parda       5518
Preta       2654
Branca      1615
Amarela      107
Indígena      66
Name: TP_COR_RACA, dtype: int64

In [134]:
# NOTE(review): this call receives the unfiltered `data_frame_diversidade`, so
# the 'Não quis declarar' and 'Sem informação' rows are counted as categories.
# The preceding cell builds `dataframe_temp` without those rows but it is not
# used here, whereas the deficiency section does pass its filtered frame.
# Confirm which input is intended; switching changes the table shown below.
std_cor_raca = calcula_diversidade_global(data_frame_diversidade, 'TP_COR_RACA', 'TP_COR_RACA_QUANTIDADE' )

In [212]:
print('\n\nDiversidade de Cor e Raça')
# Index the rows by ANOS, the same list calcula_diversidade_global iterates,
# so the year labels cannot drift from the order of the computed values.
display(pd.DataFrame(std_cor_raca, index=ANOS))



Diversidade de Cor e Raça


Unnamed: 0,IFBA,UFBA,UEFS,UNEB,IFBAIANO,UESB,UNIVASF,UESC
2015,0.3991,0.4176,0.3107,0.0522,0.055,0.073,0.0661,0.1489
2016,0.3818,0.3804,0.2773,0.075,0.0622,0.0537,0.0581,0.1202
2017,0.3891,0.4076,0.3134,0.1003,0.0701,0.0542,0.0725,0.1088
2018,0.3854,0.4276,0.3063,0.1028,0.086,0.0651,0.085,0.0751
2019,0.3784,0.4251,0.262,0.0771,0.1126,0.076,0.0938,0.0485


## Análise da dimensão Reserva de Vaga


In [217]:
# Indicator columns, one per kind of reserved seat, that are counted per
# institution and year.
colunas_reserva = [
    'IN_RESERVA_ETNICO',
    'IN_RESERVA_DEFICIENCIA',
    'IN_RESERVA_ENSINO_PUBLICO',
    'IN_RESERVA_RENDA_FAMILIAR',
    'IN_RESERVA_OUTRA',
]
df_reserva_vagas = construcao_data_frame_reserva(
    data_frame_diversidade,
    colunas_reserva,
)


In [218]:
# The reserve frame is already aggregated, so skip the counting step.
std_reserva = calcula_diversidade_global(
    df_reserva_vagas,
    'RESERVA',
    'RESERVA_QUANTIDADE',
    is_reserva=True,
)

In [220]:
print('\n\nDiversidade de Reserva de Vaga')
# Index the rows by ANOS, the same list calcula_diversidade_global iterates,
# so the year labels cannot drift from the order of the computed values.
display(pd.DataFrame(std_reserva, index=ANOS))



Diversidade de Reserva de Vaga


Unnamed: 0,IFBA,UFBA,UEFS,UNEB,IFBAIANO,UESB,UNIVASF,UESC
2015,0.431,0.3471,0.3304,0.1562,0.0732,0.0,0.0325,0.0401
2016,0.4358,0.3207,0.2819,0.0431,0.0789,0.0,0.0588,0.0261
2017,0.3861,0.497,0.3079,0.0861,0.0763,0.0,0.0761,0.0
2018,0.3713,0.4914,0.1347,0.0995,0.0768,0.0,0.088,0.0732
2019,0.4154,0.4863,0.0,0.0901,0.0939,0.0,0.0972,0.0


## Análise da dimensão Deficiência


In [188]:
# Frequency of each IN_DEFICIENCIA label across the whole dataset.
data_frame_diversidade['IN_DEFICIENCIA'].value_counts()

Normal             14343
Sem Deficiência     3926
Deficiente           145
Name: IN_DEFICIENCIA, dtype: int64

In [190]:
# Drop the rows labelled 'Sem Deficiência' (code 9 in the recode map) and show
# the remaining label frequencies.
# NOTE(review): this rebinds `dataframe_temp`, which the race/colour section
# also assigns with a different filter; the value seen by later cells depends
# on which cell ran last.
dataframe_temp = data_frame_diversidade[(data_frame_diversidade['IN_DEFICIENCIA'] != 'Sem Deficiência')]
dataframe_temp['IN_DEFICIENCIA'].value_counts()

Normal        14343
Deficiente      145
Name: IN_DEFICIENCIA, dtype: int64

In [191]:
# Uses the filtered frame from the previous cell, so only the 'Normal' and
# 'Deficiente' labels are counted.
std_deficiencia = calcula_diversidade_global(dataframe_temp, 'IN_DEFICIENCIA', 'IN_DEFICIENCIA_QUANTIDADE' )

In [214]:
print('\n\nDiversidade de Deficiências')
# Index the rows by ANOS, the same list calcula_diversidade_global iterates,
# so the year labels cannot drift from the order of the computed values.
display(pd.DataFrame(std_deficiencia, index=ANOS))



Diversidade de Deficiências


Unnamed: 0,IFBA,UFBA,UEFS,UNEB,IFBAIANO,UESB,UNIVASF,UESC
2015,0.1854,0.7022,,0.2131,0.1445,0.1739,,
2016,0.1026,0.7028,,0.2116,0.1575,0.1575,,0.1307
2017,0.1622,0.7028,,0.2197,0.1443,0.15,,0.1393
2018,0.2266,0.7031,,0.2015,0.1378,0.1484,,0.1325
2019,0.3684,0.7041,0.2426,0.1573,0.1421,0.1446,,0.1119


## Análise da dimensão Faixa etária

In [163]:
# Number of missing values in NU_IDADE; the age analysis below applies no
# missing-value handling of its own.
display(data_frame_diversidade['NU_IDADE'].isnull().sum())

0

In [164]:
# NOTE(review): pandas documents isna() and isnull() as aliases, so this cell
# repeats the previous check and could be removed.
display(data_frame_diversidade['NU_IDADE'].isna().sum())

0

In [166]:
# Standard deviation of NU_IDADE per institution and census year.
# The accumulator is keyed from UNIVERSIDADES so it always has the keys the
# inner loop appends to.
std_idade = {universidade: [] for universidade in UNIVERSIDADES}

for ano in ANOS:
    current_dataframe = data_frame_diversidade[data_frame_diversidade['NU_ANO_CENSO'] == ano]
    for universidade in UNIVERSIDADES:
        current_dataframe_temp = current_dataframe[current_dataframe['SG_IES'] == universidade]
        std_result = calculo_desvio_padrao('NU_IDADE', current_dataframe_temp)
        std_idade[universidade].append(std_result)


In [215]:
print('\n\nDiversidade de Faixa Etária')
# Index the rows by ANOS, the same list the age loop iterates, so the year
# labels cannot drift from the order of the computed values.
display(pd.DataFrame(std_idade, index=ANOS))



Diversidade de Faixa Etária


Unnamed: 0,IFBA,UFBA,UEFS,UNEB,IFBAIANO,UESB,UNIVASF,UESC
2015,7.1478,5.9189,3.5537,5.7063,7.115,4.1488,4.7406,4.7779
2016,7.2188,6.149,3.3543,5.4729,6.9846,4.1811,4.7514,3.931
2017,7.5999,6.6709,3.6481,5.8396,6.2391,3.9899,4.8307,3.7883
2018,7.4512,6.9949,3.7665,6.0166,6.1929,4.2479,4.5633,4.41
2019,6.8866,7.1936,3.8909,5.8875,5.4825,3.897,4.9169,3.7824
