This notebook prepares the code that will later be moved into the .py file containing the ETL pipeline.
For the decision-making process behind each feature, please see the Exploratory Analysis notebook.

In [2]:
import pandas as pd
import numpy as np


def load_data(filepath):
    '''
    Read a csv file into a pandas dataframe.

    Input
    filepath - string of the file path where the csv is located;
               the first column of the file is used as the dataframe index
    Output - pandas dataframe
    '''
    return pd.read_csv(filepath, index_col=0)
    
# relative path of the raw portfolio csv; the resulting frame is what clear_data receives below
filepath = 'data/estaticos_portfolio1.csv'
df = load_data(filepath)

Copying the functions written to prepare the data

In [3]:
def apply_median(df,columns):
    '''
    Fill the NaN values of the listed columns with the median of each column.

    Input
    df - pandas dataframe; it is modified in place
    columns - list with the names of the columns to fill
    Output - the same dataframe object that was passed in
    '''
    for name in columns:
        current = df.loc[:, name]
        median_value = current.median()
        df.loc[:, name] = current.fillna(median_value)
    return df


def create_dummy_df(df, cat_cols, dummy_na):
    '''
    Replace categorical columns with 0/1 dummy columns.

    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols;
               the first category of every column is dropped (drop_first=True)
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating
               the column name from the category value

    Names in cat_cols that are not columns of df are skipped. Any other failure
    is raised instead of being silently ignored.
    '''
    for col in cat_cols:
        # a listed column may be absent (for instance removed by an earlier cleaning
        # step); skipping it is the only case that is tolerated on purpose
        if col not in df.columns:
            continue
        dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_',
                                 drop_first=True, dummy_na=dummy_na)
        # for each cat add dummy var, drop original column
        df = pd.concat([df.drop(columns=col), dummies], axis=1)
    return df



def categorical_months_to_years (df,column):
    '''
    Turn a numerical column expressed in months into 0/1 columns, one per whole year.

    INPUT:
    df - pandas dataframe
    column - list whose first element is the name of the numerical column in months
             to be transformed into categorical columns with integer intervals in years.

    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains new columns with the integer intervals found in the maximum years value:
               <name>_gt_<max>          value is at least <max> years
               <name>_lt_<k+1>_gt_<k>   value is at least k and less than k+1 years
               <name>_lt_1              value is less than 1 year
               (when the maximum is below one year only <name>_gt_0 is created)
            2. removes the original column.
            3. dummy columns for each of the intervals found within the maximum year value.
            4. Use a prefix of the column name with an underscore (_) for separating
               the column name from the interval
            5. keeps the index of the input dataframe

    Rows whose value is NaN or negative belong to no interval and get 0 in every new column.
    '''
    months = 12 # number of months to divide

    name = column[0]
    elapsed_months = df.loc[:, name].to_numpy(dtype=float)
    has_value = ~np.isnan(elapsed_months)

    #finding the maximum interval: it comes from the largest value of the column
    #(not from how often a value occurs) and NaNs are left out of the search
    if has_value.any():
        max_years = max(int(elapsed_months[has_value].max() / months), 0)
    else:
        max_years = 0

    #upper to lower interval edges, in years: max_years, ..., 1, 0
    range_years = list(range(max_years, -1, -1))

    #column names, in the same order as the edges
    column_list = [name + '_gt_' + str(max_years)]#gt = greater than
    for years in range(max_years - 1, 0, -1):
        column_list.append(name + '_lt_' + str(years + 1) + '_gt_' + str(years))
    if max_years >= 1:
        column_list.append(name + '_lt_1')#lt = less than

    #the edges are in years, so the months have to be converted before comparing
    elapsed_years = elapsed_months / months
    edges = np.asarray(range_years, dtype=float)

    #position of the interval of each row = number of edges above the value
    bucket = (elapsed_years[:, None] < edges[None, :]).sum(axis=1)

    #NaN rows and values below the last edge do not fall in any interval
    in_range = has_value & (bucket < len(edges))

    #filling the respective interval column with 1 and the remaining with 0
    flags = np.zeros((len(elapsed_years), len(column_list)), dtype=int)
    flags[np.flatnonzero(in_range), bucket[in_range]] = 1

    #reusing the index of df keeps the rows aligned in the concat below
    temp_df = pd.DataFrame(flags, index=df.index, columns=column_list)

    #adding the new columns to the original dataframe and dropping the original column
    df = pd.concat([df.drop(columns=column), temp_df], axis=1)

    return df


def categorical_1k_wNaN (df,column):
    '''
    Turn a numerical column into 0/1 interval columns plus a 0/1 column marking NaN values.

    INPUT:
    df - pandas dataframe
    column - list whose first element is the name of the numerical column
             to be transformed into categorical columns

    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains new columns with the 1000 intervals found in the maximum value,
               with the last 1000 split in two intervals of 500.
            2. removes the original column.
            3. dummy columns for each of the intervals found within the maximum value.
            4. 0,1 column where 1 represents the value being a NaN in the original column
            5. Use a prefix of the column name with an underscore (_) for separating
               the column name from the interval
            6. keeps the index of the input dataframe

    Interval covered by each column (edges are plain units, not thousands):
        <name>_gt_<M>k                  value >= M*1000, M being the maximum value in whole thousands
        <name>_lt_<e+1>k_gt_<e>k        e <= value < e + 1000   (e = 1000, 2000, ...)
        <name>_lt_500.5k_gt_500k        500 <= value < 1000
        <name>_lt_500                   0 <= value < 500
        <name>_NaN                      value is NaN
    NOTE(review): the "k" suffix and the "+1"/"+0.5" in the generated names do not
    describe the real edges; the names are kept as they are so existing outputs keep
    their columns - confirm before renaming.

    Negative values belong to no interval and get 0 in every new column.
    When the maximum is below 1000 only <name>_gt_0k and <name>_NaN are created.
    '''
    thousands = 1000 # number of thousands to divide
    half = 500 # width of the two lowest intervals

    name = column[0]
    amounts = df.loc[:, name].to_numpy(dtype=float)
    is_nan = np.isnan(amounts)

    #finding the maximum interval; NaNs are left out because a comparison
    #against NaN would make the maximum itself unreliable
    if (~is_nan).any():
        max_thousands = max(int(amounts[~is_nan].max() / thousands), 0)
    else:
        max_thousands = 0
    max_interval = max_thousands * thousands

    #upper to lower interval edges and the respective column names
    range_salary = [max_interval]
    column_list = [name + '_gt_' + str(max_thousands) + 'k']#gt = greater than

    salary = max_interval
    #one interval per thousand down to 1000
    while salary > thousands:
        salary = salary - thousands
        range_salary.append(salary)
        column_list.append(name + '_lt_' + str(salary + 1) + 'k' + '_gt_' + str(salary) + 'k')
    #the last thousand is split at 500
    if salary > half:
        salary = salary - half
        range_salary.append(salary)
        column_list.append(name + '_lt_' + str(salary + 0.5) + 'k' + '_gt_' + str(salary) + 'k')
    if 0 < salary <= half:
        range_salary.append(0)
        column_list.append(name + '_lt_' + str(salary))#lt = less than
    #the NaN column always comes last and is added exactly once
    column_list.append(name + '_NaN')

    edges = np.asarray(range_salary, dtype=float)
    nan_position = len(edges)

    #position of the interval of each row = number of edges above the value
    bucket = (amounts[:, None] < edges[None, :]).sum(axis=1)

    #rows that really fall in one of the intervals (not NaN, not below the last edge)
    in_range = ~is_nan & (bucket < nan_position)

    #filling the respective interval column with 1, the NaN column for the NaN rows
    #and the remaining with 0
    flags = np.zeros((len(amounts), len(column_list)), dtype=int)
    flags[np.flatnonzero(in_range), bucket[in_range]] = 1
    flags[is_nan, nan_position] = 1

    #reusing the index of df keeps the rows aligned in the concat below
    temp_df = pd.DataFrame(flags, index=df.index, columns=column_list)

    #adding the new columns to the original dataframe and dropping the original column
    df = pd.concat([df.drop(columns=column), temp_df], axis=1)

    return df

In [4]:
def clear_data(df):
    '''
    Run every cleaning and encoding step on a portfolio dataframe.

    Input
    A portfolio type dataframe

    Output
    Pandas dataframe where the listed flag, vehicle and categorical columns are
    turned into numerical 0/1 features and the listed partner columns have their
    NaN values filled with the median.
    NOTE(review): the goal is a frame with no string type variables and no NaN
    values, but nothing here checks it for the columns that are not listed
    below - confirm on the result.
    '''

    def flag_to_int(value):
        #True becomes 1, anything else (False, NaN, ...) becomes 0
        if value == True:
            return 1
        return 0

    def owns_vehicles(value):
        #0 stays 0, anything else becomes 1
        #NOTE(review): a NaN is different from 0, so it also becomes 1 - confirm this is wanted
        if value == 0:
            return 0
        return 1

    #columns where more than 50% of the values are missing are removed
    missing_share = df.isna().mean()
    mostly_missing = set(df.columns[missing_share > .5])
    df = df.drop(columns=mostly_missing)

    #columns removed by name
    df = df.drop(columns=[
        'qt_socios_pf','qt_socios_pj', 'fl_matriz','natureza_juridica_macro',
        'de_natureza_juridica','de_ramo','idade_emp_cat', 'dt_situacao','fl_st_especial',
        'fl_email','fl_telefone','nm_segmento','fl_optante_simples','vl_faturamento_estimado_aux',
        'vl_faturamento_estimado_grupo_aux',
    ])

    #True/False flag columns become 1/0
    for flag in ['fl_me','fl_sa','fl_epp','fl_mei','fl_ltda','fl_rm','fl_spa',
                 'fl_antt','fl_veiculo','fl_simples_irregular','fl_passivel_iss']:
        df[flag] = df.loc[:, flag].apply(flag_to_int)

    #vehicle totals become a 0/1 "has vehicles" indicator
    for vehicles in ['vl_total_veiculos_leves_grupo','vl_total_veiculos_pesados_grupo']:
        df[vehicles] = df.loc[:, vehicles].apply(owns_vehicles)

    #NaN values of the partner columns are filled with the column median
    df = apply_median(df, ['idade_media_socios','idade_maxima_socios','idade_minima_socios',
                           'qt_socios','qt_socios_st_regular'])

    #company age is kept with 2 decimal cases
    company_age = df.loc[:, 'idade_empresa_anos']
    df.loc[:, 'idade_empresa_anos'] = company_age.round(decimals=2)

    #the "no information" label is turned into a NaN so that it is encoded as a missing value
    revenue_band = df.loc[:, 'de_faixa_faturamento_estimado']
    df.loc[:, 'de_faixa_faturamento_estimado'] = revenue_band.replace('SEM INFORMACAO', np.nan)

    #categorical columns encoded as dummies, without a column for the NaN values
    df = create_dummy_df(df, ['sg_uf','setor','nm_divisao'], False)

    #categorical columns encoded as dummies, with NaN as a feature of its own
    df = create_dummy_df(df,
                         ['de_saude_tributaria','de_nivel_atividade','nm_meso_regiao',
                          'de_faixa_faturamento_estimado','de_faixa_faturamento_estimado_grupo'],
                         True)

    #numerical columns turned into interval (categorical) features
    df = categorical_months_to_years(df, ['nu_meses_rescencia'])
    df = categorical_1k_wNaN(df, ['empsetorcensitariofaixarendapopulacao'])

    return df

# NOTE: df is rebound to the cleaned frame, so running this cell again without reloading
# the csv passes already-cleaned data (raw columns gone) to clear_data
df = clear_data(df)
df.head()

Unnamed: 0,id,idade_empresa_anos,fl_me,fl_sa,fl_epp,fl_mei,fl_ltda,fl_rm,fl_spa,fl_antt,...,nu_meses_rescencia_lt_1,empsetorcensitariofaixarendapopulacao_gt_6k,empsetorcensitariofaixarendapopulacao_lt_5001k_gt_5000k,empsetorcensitariofaixarendapopulacao_lt_4001k_gt_4000k,empsetorcensitariofaixarendapopulacao_lt_3001k_gt_3000k,empsetorcensitariofaixarendapopulacao_lt_2001k_gt_2000k,empsetorcensitariofaixarendapopulacao_lt_1001k_gt_1000k,empsetorcensitariofaixarendapopulacao_lt_500.5k_gt_500k,empsetorcensitariofaixarendapopulacao_lt_500,empsetorcensitariofaixarendapopulacao_NaN
0,dabe79bec87c88ae04e869bf6bd321ee5e1893cecf6625...,0.65,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,32e5f4e10932153a7ba869cb0386e7e02d49d2461046b8...,6.59,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,a95d6f30bba445bd3d6b0c5b36f865b38ec01d17336090...,8.01,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,6cb309685cea0b6d2988818792ec2e6fcb2bd02e0afa9e...,20.86,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,f72aa7fa6787b0a5a1c88885b6120850df8ee0f71adc25...,18.73,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
def load_data(df,filepath):
    '''
    Write the dataframe to a csv file, leaving the index out of the file.

    NOTE(review): this definition reuses the name of the csv-reading
    load_data(filepath) from the first cell and replaces it once this cell runs;
    a distinct name (for instance save_data) would avoid that - the call sites
    have to be renamed together with it.

    Input
    pandas dataframe and the filepath to save the dataframe
    Output
    File with the pandas dataframe (nothing is returned)
    '''
    df.to_csv(path_or_buf=filepath, index=False)
    
# destination of the cleaned portfolio csv; load_data here is the two-argument
# definition from this cell, which writes the frame without its index
filepath_save = 'data/portfolio1_clean.csv'
load_data(df,filepath_save)