In [1]:
import os
import sys
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats


PROJECT_NAME = 'UFAL-DS'
curdir = os.path.abspath(os.path.curdir).split('/')
project_index = curdir.index(PROJECT_NAME)
os.chdir('/' + os.path.join(*curdir[:project_index + 1]))

In [15]:
aposentados = pd.read_csv("data/processed/aposentados.csv")
dataset = aposentados.drop_duplicates(subset=['name', 'cpf'], keep='first')
dataset.shape

(415950, 19)

## Hypotesis
* H0 the retirement value for men and women are equal
* HA are different

In [16]:
men = dataset.loc[dataset.gender == "M", "value"].to_numpy()
mn = men.shape[0]
print("Nº men:", mn)

women = dataset.loc[dataset.gender == "F", "value"].to_numpy()
wn = women.shape[0]
print("Nº women:", wn)

Nº men: 189145
Nº women: 215542


In [17]:
dataset.loc[dataset.gender == "M", "value"].describe()

count    189145.000000
mean       6921.636250
std        5052.191104
min           0.010000
25%        3552.220000
50%        5098.450000
75%        8579.640000
max      106853.170000
Name: value, dtype: float64

In [18]:
dataset.loc[dataset.gender == "F", "value"].describe()

count    215542.000000
mean       6002.055352
std        4255.832307
min           0.010000
25%        3230.867500
50%        4619.670000
75%        7091.075000
max       77048.590000
Name: value, dtype: float64

In [19]:
t, p = stats.ttest_ind(men, women, equal_var=False)
print("t = " + str(t))
print("p = " + str(p))

t = 62.142643046731465
p = 0.0


## Profissional level

In [23]:
max(dataset["level"].unique())

801

In [33]:
men = dataset.loc[(dataset.gender == "M") & (dataset.level == 801), "value"].to_numpy()
mn = men.shape[0]

women = dataset.loc[(dataset.gender == "F") & (dataset.level == 801), "value"].to_numpy()
wn = women.shape[0]

t, p = stats.ttest_ind(men, women, equal_var=False)
print("t = " + str(t))
print("p = " + str(p))

t = -0.31748355234366427
p = 0.7508898625365367


In [34]:
men.mean()

15063.986239729338

In [35]:
women.mean()

15103.84823583181

## Region

In [36]:
aposentados = pd.read_csv("data/processed/aposentados.csv")
abono = pd.read_csv("data/processed/abono.csv")

aposentados = aposentados.drop_duplicates(subset=['name', 'cpf'], keep='first')
abono = abono.drop_duplicates(subset=['name', 'cpf'], keep='first')

aposentados["retirement_year"] = pd.to_datetime(aposentados.retirement_date).dt.year

use_cols = [
    'registration_n', 'agency_acronym', 'superior_agency_cod', 
    'classes', 'pattern', 'ref', 'level', 'retirement_type', 
    'legal_substantiation', 'doc_title', 'retirement_date',
    'admission_type', 'admission_date', 'value'
]
data = abono.merge(aposentados[['name', 'cpf'] + use_cols], left_on=['name', 'cpf'], right_on=['name', 'cpf'])

data.shape

(22213, 29)

In [37]:
dataset = data
dataset.loc[dataset.upag_state == "pi", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "ma", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "ce", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "rn", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "pb", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "pe", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "al", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "se", "region"] = "nordeste"
dataset.loc[dataset.upag_state == "ba", "region"] = "nordeste"

dataset.loc[dataset.upag_state == "es", "region"] = "sudeste"
dataset.loc[dataset.upag_state == "rj", "region"] = "sudeste"
dataset.loc[dataset.upag_state == "sp", "region"] = "sudeste"
dataset.loc[dataset.upag_state == "mg", "region"] = "sudeste"

dataset.loc[dataset.upag_state == "ac", "region"] = "norte"
dataset.loc[dataset.upag_state == "am", "region"] = "norte"
dataset.loc[dataset.upag_state == "ro", "region"] = "norte"
dataset.loc[dataset.upag_state == "rr", "region"] = "norte"
dataset.loc[dataset.upag_state == "am", "region"] = "norte"
dataset.loc[dataset.upag_state == "pa", "region"] = "norte"
dataset.loc[dataset.upag_state == "to", "region"] = "norte"

dataset.loc[dataset.upag_state == "df", "region"] = "centro-oeste"
dataset.loc[dataset.upag_state == "go", "region"] = "centro-oeste"
dataset.loc[dataset.upag_state == "mt", "region"] = "centro-oeste"
dataset.loc[dataset.upag_state == "ms", "region"] = "centro-oeste"

dataset.loc[dataset.upag_state == "pr", "region"] = "sul"
dataset.loc[dataset.upag_state == "rs", "region"] = "sul"
dataset.loc[dataset.upag_state == "sc", "region"] = "sul"

In [38]:
for region in ["nordeste", "sudeste", "norte", "centro-oeste", "sul"]:
    men = dataset.loc[(dataset.gender == "M") & (dataset.region == region), "value_y"].to_numpy()
    mn = men.shape[0]

    women = dataset.loc[(dataset.gender == "F") & (dataset.region == region), "value_y"].to_numpy()
    wn = women.shape[0]
    
    t, p = stats.ttest_ind(men, women, equal_var=False)
    print(region)
    print("t = " + str(t))
    print("p = " + str(p))

nordeste
t = -5.057939757322786
p = 4.369759664256538e-07
sudeste
t = 3.0442103869883086
p = 0.0023412555385567075
norte
t = 3.534136771043425
p = 0.00041810071196586314
centro-oeste
t = 1.4734028949038966
p = 0.1407255989157796
sul
t = 2.9098284220500097
p = 0.003669048508179476


In [51]:
dataset.loc[(dataset.gender == "M") & (dataset.region == "nordeste"), "value_y"].mean()

5761.060670897551

In [52]:
dataset.loc[(dataset.gender == "F") & (dataset.region == "nordeste"), "value_y"].mean()

6341.348445769662

## Educational level 

In [58]:
level = [
    'doutorado                                    ',
    'mestrado                                     '
]
men = abono.loc[(abono.gender == "M") & (abono.educational_level.isin(level)), "value"].to_numpy()
mn = men.shape[0]

women = abono.loc[(abono.gender == "F") & (abono.educational_level.isin(level)), "value"].to_numpy()
wn = women.shape[0]

t, p = stats.ttest_ind(men, women, equal_var=False)
print("t = " + str(t))
print("p = " + str(p))

t = 3.4515111954735755
p = 0.0005684066830451799


In [46]:
dataset.educational_level.unique()

array(['ensino medio                                 ',
       'ensino superior                              ',
       'ensino fundamental                           ',
       'segundo grau incompleto                      ',
       'ensino fundamental incompleto                ',
       'mestrado                                     ',
       'doutorado                                    ',
       'alfabetizado sem cursos regulares            ',
       'superior incompleto                          ',
       '4a. serie do primeiro grau completa          ',
       'primeiro grau incomp.-ate a 4a.serie incomp. '], dtype=object)

In [60]:
women.mean()

1928.3729773269688

In [49]:
abono.loc[abono.educational_level == 'doutorado                                    '].shape

(1597, 15)