<a href="https://colab.research.google.com/github/Pugianf/Big_Data_and_Public_Sector_I/blob/main/Atividade_1_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# importando bibliotecas necessárias

import numpy as np
import pandas as pd
import statsmodels.api as sm
from zipfile import ZipFile
from scipy import stats

In [3]:
# Connect to Google Drive: mounts it at /content/drive for this Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Folder (on the mounted Drive) that holds the dataset archive.
sCaminho = '/content/drive/MyDrive/IDP/Dataset/'

# Read the CSV directly out of the zipped archive into `df`.
sArquivo = f"{sCaminho}PNADC_042019.zip"
with ZipFile(sArquivo) as z:
    # The archive member gets its own `with` block so its handle is closed
    # deterministically as soon as pandas has finished reading it.
    with z.open("PNADC_042019.csv") as arquivo_csv:
        df = pd.read_csv(arquivo_csv)
    # List the archive contents, one member name per line.
    print(*z.namelist(), sep="\n")

PNADC_042019.csv


In [5]:
# Remove the 'Unnamed: 0' column (presumably the index that was saved along
# with the CSV -- confirm); the frame is modified in place.
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
# Display the loaded frame (the notebook renders a truncated preview).
df

Unnamed: 0,Ano,Trimestre,UF,UPA,Estrato,V1008,V1014,V1022,V1027,V1028,V1029,posest,V2001,V2003,V2005,V2007,V2009,V2010,VD2002,VD2003,VD3004,VD3005,VD3006,VD4001,VD4002,VD4003,VD4005,VD4008,VD4009,VD4010,VD4016,VD4017,VD4019,VD4020,VD4031,VD4035,VD4036,VD4037
0,2019,4,11,110000016,1110011,1,7,1,98.756636,150.693106,532471,111,4,1,1,2,45,4,1,4,5,12,5,1,1,,,1,1,9,2100.0,2100.0,2100.0,2100.0,36,30,2,2
1,2019,4,11,110000016,1110011,3,7,1,98.756636,150.693106,532471,111,5,2,2,2,54,4,2,5,5,12,5,1,1,,,3,7,9,1000.0,1000.0,1000.0,1000.0,44,38,3,2
2,2019,4,11,110000016,1110011,4,7,1,98.756636,150.693106,532471,111,2,1,1,1,24,1,1,2,5,12,5,1,1,,,5,9,6,2500.0,2500.0,2500.0,2500.0,48,42,4,3
3,2019,4,11,110000016,1110011,4,7,1,98.756636,150.693106,532471,111,2,2,12,2,21,4,10,2,5,12,5,1,1,,,1,2,6,1000.0,1000.0,1000.0,1000.0,48,42,4,3
4,2019,4,11,110000016,1110011,5,7,1,98.756636,150.693106,532471,111,5,3,4,1,30,1,3,5,5,12,5,1,1,,,1,1,4,1600.0,1600.0,1600.0,1600.0,44,38,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214284,2019,4,53,530051067,5310211,6,7,1,132.055124,147.638607,3027352,531,3,2,2,1,59,1,2,3,2,5,3,1,1,,,1,1,2,1600.0,1600.0,1600.0,1600.0,44,44,3,3
214285,2019,4,53,530051067,5310211,7,7,1,132.055124,147.638607,3027352,531,2,2,2,2,43,4,2,2,5,12,5,1,1,,,1,1,7,1400.0,1400.0,1400.0,1400.0,40,40,3,3
214286,2019,4,53,530051067,5310211,8,7,1,132.055124,147.638607,3027352,531,1,1,1,2,51,2,1,1,2,5,3,1,1,,,5,9,6,400.0,400.0,400.0,400.0,30,30,2,2
214287,2019,4,53,530051067,5310211,12,7,1,132.055124,147.638607,3027352,531,3,1,1,2,42,1,1,3,3,9,4,1,1,,,1,2,7,1700.0,1700.0,1700.0,1700.0,44,44,3,3


In [7]:
# Missing income values (VD4020) are not people earning 0: they are people who
# DO NOT WORK, so those rows are removed.
df = df.dropna(subset=['VD4020'])

# Remove people outside the working-age range: keep ages 15 to 65, inclusive.
# The explicit .copy() makes `df` an independent frame instead of a slice of
# the previous one, so the column assignments made in later cells do not
# trigger pandas' SettingWithCopyWarning.
df = df.loc[(df["V2009"] >= 15) & (df["V2009"] <= 65)].copy()

In [8]:
# Check the dimensions (rows, columns) of the filtered frame.

df.shape

(214289, 38)

In [9]:
# Household identifier: UPA followed by V1008 and V1014.
# V1008 and V1014 are zero-padded to two digits before concatenating. Without
# a fixed width, different code combinations collapse into the same string
# (e.g. V1008=1, V1014=12 and V1008=11, V1014=2 would both append "112").
# NOTE(review): assumes these codes never exceed two digits -- confirm against
# the survey dictionary. The resulting strings differ from an unpadded
# concatenation, so any external key built the old way must be rebuilt too.
df['iddom'] = (
    df['UPA'].astype(str)
    + df['V1008'].astype(str).str.zfill(2)
    + df['V1014'].astype(str).str.zfill(2)
)

# Individual identifier: household id followed by the zero-padded V2003 code.
df['idind'] = df['iddom'] + df['V2003'].astype(str).str.zfill(2)

In [10]:
# Age and age squared: V2009 is renamed to "idade" (in place) and its square
# is stored in "idadesq".
df.rename({'V2009': 'idade'}, axis='columns', inplace=True)

df['idadesq'] = df['idade'] * df['idade']

In [12]:
# Female dummy: the V2007 code shifted down by one.
# NOTE(review): presumably V2007 is coded 1 = male, 2 = female, giving a 0/1
# indicator -- confirm against the survey dictionary.
codigo_sexo = df['V2007']

df['feminino'] = codigo_sexo - 1

In [13]:
# Replace the numeric colour/race codes in V2010 with their labels (code 9
# becomes missing) and count each category, NaN included.
# The recoded Series is assigned back to the column: calling
# replace(..., inplace=True) on the Series returned by df["V2010"] is chained
# assignment, which pandas 2.x warns about and which leaves `df` unchanged
# when Copy-on-Write is enabled.
codigos_cor = {
    1: 'branca',
    2: 'preta',
    3: 'amarela',
    4: 'parda',
    5: 'indigena',
    9: np.nan,
}
df['V2010'] = df['V2010'].replace(codigos_cor)

df['V2010'].value_counts(dropna=False)

parda       104266
branca       86858
preta        21022
amarela       1168
indigena       937
NaN             38
Name: V2010, dtype: int64

In [14]:
# Rename the V2010 column to "cor" (in place).
df.rename({'V2010': 'cor'}, axis='columns', inplace=True)

In [15]:
# Create one indicator column per "cor" category and append them to the frame.
# dtype=int pins the indicators to 0/1 integers: left to its default,
# get_dummies returns bool columns on pandas >= 2.0 and uint8 before that
# (uint8 wraps around on subtraction), so results would depend on the
# installed pandas version.
df = pd.concat([df, pd.get_dummies(df['cor'], dtype=int)], axis=1)

In [16]:
# Check the dummies: show the "cor" label next to its indicator columns.
colunas_cor = ['cor', 'branca', 'preta', 'amarela', 'parda', 'indigena']

df[colunas_cor]

Unnamed: 0,cor,branca,preta,amarela,parda,indigena
0,parda,0,0,0,1,0
1,parda,0,0,0,1,0
2,branca,1,0,0,0,0
3,parda,0,0,0,1,0
4,branca,1,0,0,0,0
...,...,...,...,...,...,...
214284,branca,1,0,0,0,0
214285,parda,0,0,0,1,0
214286,preta,0,1,0,0,0
214287,branca,1,0,0,0,0


In [17]:
# Replace the numeric education codes in VD3004 with their labels and show the
# share (in percent) of each level, NaN included.
# The recoded Series is assigned back to the column: calling
# replace(..., inplace=True) on the Series returned by df["VD3004"] is chained
# assignment, which pandas 2.x warns about and which leaves `df` unchanged
# when Copy-on-Write is enabled.
niveis_educ = {
    1: 'sem_instrucao',
    2: 'fund_incompleto',
    3: 'fund_completo',
    4: 'medio_incompleto',
    5: 'medio_completo',
    6: 'superior_incompleto',
    7: 'superior_completo',
}
df['VD3004'] = df['VD3004'].replace(niveis_educ)

df['VD3004'].value_counts(normalize=True, dropna=False)*100

medio_completo         32.885962
fund_incompleto        25.040483
superior_completo      18.809645
fund_completo           8.201074
medio_incompleto        6.794096
superior_incompleto     5.887843
sem_instrucao           2.380897
Name: VD3004, dtype: float64

In [18]:
# Create one indicator column per VD3004 category and append them to the frame.
# dtype=int pins the indicators to 0/1 integers: left to its default,
# get_dummies returns bool columns on pandas >= 2.0 and uint8 before that
# (uint8 wraps around on subtraction), so results would depend on the
# installed pandas version.
df = pd.concat([df, pd.get_dummies(df['VD3004'], dtype=int)], axis = 1)

In [19]:
# V1022 (rural households): build the "rural" dummy as the code minus one.
# NOTE(review): presumably V1022 is coded 1 = urban, 2 = rural -- confirm
# against the survey dictionary.
codigo_situacao = df['V1022']

df['rural'] = codigo_situacao - 1

In [20]:
# Check proportions: share of each "rural" value, in percent (NaN included).
proporcao_rural = df['rural'].value_counts(normalize=True, dropna=False)

proporcao_rural * 100

0    78.682994
1    21.317006
Name: rural, dtype: float64

In [22]:
# Shift two labour-market codes down by one, overwriting the columns:
#   VD4001 - labour force (more specifically, being outside it)
#   VD4002 - occupation (more specifically, unemployed people)
# NOTE(review): the columns are overwritten with themselves minus one, so
# every extra run of this cell subtracts 1 again -- run it once per data load.
for coluna in ('VD4001', 'VD4002'):
    df[coluna] = df[coluna] - 1

In [23]:
# Occupation dummies by type of work -- aggregated version (VD4008).
# The recoded Series is assigned back to the column: calling
# replace(..., inplace=True) on the Series returned by df['VD4008'] is chained
# assignment, which pandas 2.x warns about and which leaves `df` unchanged
# when Copy-on-Write is enabled.
df['VD4008'] = df['VD4008'].replace(
    [1, 2, 3, 4, 5, 6],
    ['privado', 'domestico', 'publico', 'empregador', 'conta_propria', 'familiar'],
)

# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (the default is bool on pandas >= 2.0 and uint8 before that).
df = pd.concat([df, pd.get_dummies(df['VD4008'], dtype=int)], axis = 1)

# 'conta_propria' and 'familiar' are produced again by the VD4009 dummies
# below, so the aggregated copies are dropped here (presumably to avoid
# duplicated column names). errors='ignore' keeps the cell from failing when
# one of the two categories does not occur in the data.
df = df.drop(['conta_propria', 'familiar'], axis = 1, errors='ignore')

# Disaggregated version (VD4009); same assign-back form as above.
df['VD4009'] = df['VD4009'].replace(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    [
        'privado_formal', 'privado_informal',
        'domestico_formal', 'domestico_informal',
        'publico_formal', 'publico_informal',
        'militar', 'empregador1', 'conta_propria', 'familiar',
    ],
)

df = pd.concat([df, pd.get_dummies(df['VD4009'], dtype=int)], axis = 1)

# Sector of occupation (VD4010).
df['VD4010'] = df['VD4010'].replace(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    [
        'agro', 'industria', 'construcao', 'comercio', 'transporte',
        'aloj_alim', 'servicos', 'adm_publica', 'educ_saude',
        'outros_servicos', 'servicos_domesticos', 'ativ_mal_definidas',
    ],
)

df = pd.concat([df, pd.get_dummies(df['VD4010'], dtype=int)], axis = 1)

In [24]:
# Rename the education columns (in place).
nomes_educ = {
    "VD3005": "educ",
    "VD3004": "grau_educ",
}

df.rename(nomes_educ, axis='columns', inplace=True)

In [25]:
# Rename the income and hours columns (in place).
# NOTE(review): VD4032 is not among the columns shown when the frame was
# displayed; rename() silently skips keys that are not present -- confirm
# whether that column was expected in the extract.
nomes_renda = {
    'VD4016': 'renda_hab_prin',
    'VD4017': 'renda_efet_prin',
    'VD4019': 'renda_hab_tot',
    'VD4020': 'renda_efet_tot',
    'VD4031': 'horas_hab_tot',
    'VD4032': 'horas_efet_prin',
    'VD4035': 'horas_efet_tot',
}

df.rename(nomes_renda, axis='columns', inplace=True)

In [26]:
# Summary statistics for the different income measures and hours worked.
colunas_resumo = [
    'renda_hab_tot',
    'renda_hab_prin',
    'renda_efet_tot',
    'renda_efet_prin',
    'horas_hab_tot',
    'horas_efet_tot',
]

df[colunas_resumo].describe()

Unnamed: 0,renda_hab_tot,renda_hab_prin,renda_efet_tot,renda_efet_prin,horas_hab_tot,horas_efet_tot
count,214289.0,214234.0,214289.0,214234.0,214289.0,214289.0
mean,2065.758723,1999.496032,2114.715151,2050.18653,39.499078,38.111
std,3140.350727,2926.786465,3353.240686,3156.096584,12.328975,13.942289
min,5.0,5.0,0.0,0.0,1.0,0.0
25%,998.0,998.0,950.0,900.0,36.0,30.0
50%,1300.0,1235.0,1300.0,1300.0,40.0,40.0
75%,2000.0,2000.0,2200.0,2100.0,44.0,44.0
max,300000.0,200000.0,300000.0,200000.0,120.0,120.0


In [27]:
# To control for the supply of working hours (women supply fewer hours), the
# income is divided by the number of hours worked. This is one more reason to
# use habitual rather than effective figures: the effective ones contain some
# zeros, which would cause problems in the division.
# NOTE(review): the factor 4 presumably converts weekly hours into a monthly
# total -- confirm.
horas_mensais = df['horas_hab_tot'] * 4

df['renda_hab_hora'] = df['renda_hab_tot'] / horas_mensais

# Natural log of the hourly habitual income.
df['lsalariohora'] = np.log(df['renda_hab_hora'])