After exploring the data on the small database, it is time to check whether the same transformations apply to the main database and to adjust them accordingly.

In [1]:
import pandas as pd
import numpy as np

#Copying the functions used previously
def apply_median(df,columns):
    '''
    Replace the NaN values of the given columns with the median of each column.

    INPUT:
    df - pandas dataframe that holds the columns; it is modified in place
    columns - list of column names to fill

    OUTPUT:
    df - the same dataframe object that was passed in, after the fill
    '''
    for name in columns:
        current = df.loc[:, name]
        # the median is taken from this dataframe only (NaN values are ignored by .median())
        fill_value = current.median()
        df.loc[:, name] = current.fillna(fill_value)
    return df


def create_dummy_df(df, cat_cols, dummy_na):
    '''
    Replace categorical columns with dummy (indicator) columns.

    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
               (the first category of each column is left out, drop_first=True)
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating

    Names in cat_cols that are not columns of df are skipped. The dummy columns are
    built from the values found in df, so two dataframes holding different
    categories produce different sets of dummy columns.
    '''
    for col in cat_cols:
        # Skip names that are not in the frame. This replaces a blanket
        # try/except that also hid every other kind of error.
        if col not in df.columns:
            continue
        # for each cat add dummy var, drop original column
        dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)
        df = pd.concat([df.drop(columns=col), dummies], axis=1)
    return df


In [2]:
def transform_data(df):
    '''
    Clean and encode a portfolio type dataframe.

    Input
    A portfolio type dataframe

    Output
    Pandas dataframe where:
        1. columns with more than 50% of NaN values and the columns listed in
           columns_drop are removed
        2. the boolean flag columns and the vehicle columns are 0/1 integers
        3. the partner (socios) columns have their NaN values filled with the median
        4. the listed categorical columns are replaced by dummy columns

    Columns that are not listed in any of the steps are returned unchanged.

    Every statistic used here (the 50% NaN threshold, the medians) and the set of
    dummy columns is computed from the dataframe that is passed in. When this
    function is applied to separate chunks of one file, the chunks can therefore
    end up with different columns.
    '''

    #eliminating all columns with less than 50% of the values
    less50_cols = list(df.columns[df.isna().mean() > .5])
    df = df.drop(columns=less50_cols)

    #list of columns to be dropped
    columns_drop = ['qt_socios_pf','qt_socios_pj', 'fl_matriz','natureza_juridica_macro',
                    'de_natureza_juridica','de_ramo','idade_emp_cat', 'dt_situacao','fl_st_especial',
                    'fl_email','fl_telefone','nm_segmento','fl_optante_simples','nm_micro_regiao','sg_uf_matriz',
                    'fl_optante_simei','vl_faturamento_estimado_aux','vl_faturamento_estimado_grupo_aux',
                    'nu_meses_rescencia','empsetorcensitariofaixarendapopulacao']

    #dropping the columns from the list of columns to be dropped
    #(a listed column may already be gone because of the 50% rule above, so only
    #the ones still present are dropped instead of raising a KeyError)
    df = df.drop(columns=[col for col in columns_drop if col in df.columns])

    #list of columns where True becomes 1 and anything else (False, NaN) becomes 0
    boolean_cols = ['fl_me','fl_sa','fl_epp','fl_mei','fl_ltda','fl_rm','fl_spa',
                    'fl_antt','fl_veiculo','fl_simples_irregular','fl_passivel_iss']

    #transforming the boolean columns into 0/1 integers
    for col in boolean_cols:
        df[col] = df[col].eq(True).astype('int64')

    #list of vehicles columns
    vehicle_cols = ['vl_total_veiculos_leves_grupo','vl_total_veiculos_pesados_grupo']

    #transforming having vehicles into 1 and not having vehicles (value 0) into 0
    #NOTE(review): a NaN value is different from 0, so it becomes 1 - confirm this is intended
    for col in vehicle_cols:
        df[col] = df[col].ne(0).astype('int64')

    #list of columns to fill the NaN values with the median
    median_cols = ['idade_media_socios','idade_maxima_socios','idade_minima_socios','qt_socios','qt_socios_st_regular']

    #filling the NaN values with the median of the column in the list above
    df = apply_median(df,median_cols)

    #rounding to 2 decimal cases the idade_empresa_anos feature
    df['idade_empresa_anos'] = df['idade_empresa_anos'].round(decimals=2)

    #replacing the no information string with a NaN value in the de_faixa_faturamento_estimado feature
    #(the NaN values get their own dummy column further down)
    df['de_faixa_faturamento_estimado'] = df['de_faixa_faturamento_estimado'].replace('SEM INFORMACAO', np.nan)

    #filling the NaN values of the column with the No information label
    df['de_saude_rescencia'] = df['de_saude_rescencia'].fillna('SEM INFORMACAO')

    #list of categorical columns to transform into dummy type columns with NaN as a feature
    dummy_cols_NA_True = ['sg_uf','setor','nm_divisao','de_saude_tributaria','de_nivel_atividade','nm_meso_regiao',
                          'de_faixa_faturamento_estimado','de_faixa_faturamento_estimado_grupo','de_saude_rescencia']

    #transforming the list of columns above into dummy type columns with NaN as a feature
    df = create_dummy_df(df,dummy_cols_NA_True,True)

    return df

In [3]:
def load_data(df,filepath):
    '''
    Save a dataframe as a csv file.

    Input
    df - pandas dataframe to be saved
    filepath - path of the csv file to write
    Output
    File with the pandas dataframe (the index is not written); nothing is returned
    '''
    df.to_csv(path_or_buf=filepath, index=False)

In [4]:
# location of the full market csv, passed to extract_data further down in this cell
filepath = 'data/estaticos_market.csv'

def extract_data(filepath):
    '''
    Read a csv file in chunks, transform every chunk and stack the results.

    Input
    filepath - string of the file path where the csv is located
    Output - pandas dataframe made of all the transformed chunks
    '''
    # stream the large csv 10000 rows at a time, using its first column as the index
    reader = pd.read_csv(filepath, chunksize=10000, index_col=0)

    # each chunk is a dataframe; transform_data is applied to every chunk on its own
    transformed_chunks = [transform_data(piece) for piece in reader]

    # stack the transformed chunks into a single dataframe
    return pd.concat(transformed_chunks)

# run the chunked extract + transform over the full market file
market = extract_data(filepath)
market.shape # the full database is known to have 462298 rows, so the row count checks that every chunk was imported

(462298, 175)

In [5]:
# preview the first rows of the transformed dataframe
market.head()

Unnamed: 0,id,idade_empresa_anos,fl_me,fl_sa,fl_epp,fl_mei,fl_ltda,fl_rm,fl_spa,fl_antt,...,nm_divisao_ATIVIDADES DE APOIO A EXTRACAO DE MINERAIS,nm_divisao_DESCONTAMINACAO E OUTROS SERVICOS DE GESTAO DE RESIDUOS,nm_divisao_EXTRACAO DE PETROLEO E GAS NATURAL,nm_divisao_FABRICACAO DE PRODUTOS FARMOQUIMICOS E FARMACEUTICOS,"de_faixa_faturamento_estimado_ATE R$ 81.000,00","de_faixa_faturamento_estimado_DE R$ 500.000.000,01 A 1 BILHAO DE REAIS",de_saude_rescencia_ATE 3 MESES,nm_divisao_EXTRACAO DE CARVAO MINERAL,de_saude_rescencia_ATE 6 MESES,nm_divisao_FABRICACAO DE PRODUTOS DO FUMO
0,a6984c3ae395090e3bee8ad63c3758b110de096d5d8195...,14.46,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1,6178f41ade1365e44bc2c46654c2c8c0eaae27dcb476c4...,1.46,0,0,0,1,0,0,0,0,...,,,,,,,,,,
2,4a7e5069a397f12fdd7fd57111d6dc5d3ba558958efc02...,7.09,0,0,0,1,0,0,0,0,...,,,,,,,,,,
3,3348900fe63216a439d2e5238c79ddd46ede454df7b9d8...,6.51,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4,1f9bcabc9d3173c1fe769899e4fac14b053037b953a1e4...,3.2,0,0,0,0,0,0,0,0,...,,,,,,,,,,


We can see that there are NaNs in the database, so we'll explore what went wrong. First let's take a look at the labels that were generated and then we'll see which ones have NaN values.

In [6]:
# every column label that came out of the transform
list(market.columns)

['id',
 'idade_empresa_anos',
 'fl_me',
 'fl_sa',
 'fl_epp',
 'fl_mei',
 'fl_ltda',
 'fl_rm',
 'fl_spa',
 'fl_antt',
 'fl_veiculo',
 'vl_total_veiculos_pesados_grupo',
 'vl_total_veiculos_leves_grupo',
 'fl_simples_irregular',
 'fl_passivel_iss',
 'qt_socios',
 'idade_media_socios',
 'idade_maxima_socios',
 'idade_minima_socios',
 'qt_socios_st_regular',
 'qt_filiais',
 'sg_uf_AM',
 'sg_uf_MA',
 'sg_uf_PI',
 'sg_uf_RN',
 'sg_uf_RO',
 'sg_uf_nan',
 'setor_COMERCIO',
 'setor_CONSTRUÇÃO CIVIL',
 'setor_INDUSTRIA',
 'setor_SERVIÇO',
 'setor_nan',
 'nm_divisao_AGENCIAS DE VIAGENS OPERADORES TURISTICOS E SERVICOS DE RESERVAS',
 'nm_divisao_AGRICULTURA PECUARIA E SERVICOS RELACIONADOS',
 'nm_divisao_ALIMENTACAO',
 'nm_divisao_ALOJAMENTO',
 'nm_divisao_ALUGUEIS NAO IMOBILIARIOS E GESTAO DE ATIVOS INTANGIVEIS NAO FINANCEIROS',
 'nm_divisao_ARMAZENAMENTO E ATIVIDADES AUXILIARES DOS TRANSPORTES',
 'nm_divisao_ATIVIDADES ARTISTICAS CRIATIVAS E DE ESPETACULOS',
 'nm_divisao_ATIVIDADES AUXILIARES DO

In [7]:
# names of the columns where more than half of the rows are NaN
set(market.loc[:, market.isna().mean() > .5])

{'de_faixa_faturamento_estimado_ATE R$ 81.000,00',
 'de_faixa_faturamento_estimado_DE R$ 500.000.000,01 A 1 BILHAO DE REAIS',
 'de_saude_rescencia_ATE 3 MESES',
 'de_saude_rescencia_ATE 6 MESES',
 'nm_divisao_DESCONTAMINACAO E OUTROS SERVICOS DE GESTAO DE RESIDUOS',
 'nm_divisao_EXTRACAO DE CARVAO MINERAL',
 'nm_divisao_FABRICACAO DE PRODUTOS DO FUMO',
 'nm_divisao_FABRICACAO DE PRODUTOS FARMOQUIMICOS E FARMACEUTICOS',
 'nm_divisao_ORGANISMOS INTERNACIONAIS E OUTRAS INSTITUICOES EXTRATERRITORIAIS'}

In [8]:
# share of NaN values per column, largest first (top 50 only)
market.isna().mean().sort_values(ascending=False)[0:50]

nm_divisao_EXTRACAO DE CARVAO MINERAL                                                                0.978369
nm_divisao_FABRICACAO DE PRODUTOS DO FUMO                                                            0.956738
nm_divisao_ORGANISMOS INTERNACIONAIS E OUTRAS INSTITUICOES EXTRATERRITORIAIS                         0.843612
de_saude_rescencia_ATE 3 MESES                                                                       0.805320
nm_divisao_DESCONTAMINACAO E OUTROS SERVICOS DE GESTAO DE RESIDUOS                                   0.762058
de_saude_rescencia_ATE 6 MESES                                                                       0.740427
nm_divisao_FABRICACAO DE PRODUTOS FARMOQUIMICOS E FARMACEUTICOS                                      0.653903
de_faixa_faturamento_estimado_ATE R$ 81.000,00                                                       0.589010
de_faixa_faturamento_estimado_DE R$ 500.000.000,01 A 1 BILHAO DE REAIS                               0.567379
nm_divisao

In [9]:
# value_counts leaves the NaN rows out, so the total shows how many rows actually have this dummy column
market['nm_divisao_EXTRACAO DE CARVAO MINERAL'].value_counts()

0.0    9999
1.0       1
Name: nm_divisao_EXTRACAO DE CARVAO MINERAL, dtype: int64

In [10]:
# same check on a second dummy column that is mostly NaN
market['nm_divisao_DESCONTAMINACAO E OUTROS SERVICOS DE GESTAO DE RESIDUOS'].value_counts()

0.0    109988
1.0        12
Name: nm_divisao_DESCONTAMINACAO E OUTROS SERVICOS DE GESTAO DE RESIDUOS, dtype: int64

In [11]:
# for comparison, a dummy column of a more common category
market['nm_divisao_ATIVIDADES DE PRESTACAO DE SERVICOS DE INFORMACAO'].value_counts()

0    461549
1       749
Name: nm_divisao_ATIVIDADES DE PRESTACAO DE SERVICOS DE INFORMACAO, dtype: int64

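The final check is whether the only columns left with NaN values are the dummy columns created from the categorical features, and the lists above show that this is true. The dummy columns are created separately for each chunk inside the transform data function, so a category that does not appear in a chunk gets no column there, and the concatenation fills those rows with NaN. All that is left is to replace these NaNs with zero.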

In [12]:
# a NaN in a dummy column means the category had no column in that chunk, so it is filled with 0
# NOTE: fillna(0) is applied to every column of the dataframe, not only to the dummy columns
market = market.fillna(0)

In [14]:
# confirm that no column has NaN values left after the fill
market.isna().mean().sort_values(ascending=False)

nm_divisao_FABRICACAO DE PRODUTOS DO FUMO                           0.0
nm_divisao_ATIVIDADES LIGADAS AO PATRIMONIO CULTURAL E AMBIENTAL    0.0
nm_divisao_CONSTRUCAO DE EDIFICIOS                                  0.0
nm_divisao_CONFECCAO DE ARTIGOS DO VESTUARIO E ACESSORIOS           0.0
nm_divisao_COMERCIO VAREJISTA                                       0.0
                                                                   ... 
de_saude_tributaria_CINZA                                           0.0
de_saude_tributaria_AZUL                                            0.0
nm_divisao_nan                                                      0.0
nm_divisao_TRANSPORTE TERRESTRE                                     0.0
id                                                                  0.0
Length: 175, dtype: float64

In the next notebook we'll add this to the transform function and the ETL.