In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Configuração de impressão Numpy
np.set_printoptions(suppress=True,linewidth = 200, precision = 2)

In [3]:
# Load the dataset as floats, skipping the header row.
# Fields that cannot be parsed as numbers come back as NaN.
dados = np.genfromtxt(
    "dataset1.csv",
    delimiter=';',
    skip_header=1,
    autostrip=True,
    encoding='cp1252',
)
dados

array([[48010226.  ,         nan,    35000.  , ...,         nan,         nan,     9452.96],
       [57693261.  ,         nan,    30000.  , ...,         nan,         nan,     4679.7 ],
       [59432726.  ,         nan,    15000.  , ...,         nan,         nan,     1969.83],
       ...,
       [50415990.  ,         nan,    10000.  , ...,         nan,         nan,     2185.64],
       [46154151.  ,         nan,         nan, ...,         nan,         nan,     3199.4 ],
       [66055249.  ,         nan,    10000.  , ...,         nan,         nan,      301.9 ]])

Os valores NaN aparecem em razão de caracteres não numéricos, que não foram carregados da melhor forma pelo NumPy.

In [4]:
#Verificando os valores nan nos dados
np.isnan(dados).sum()

88005

In [5]:
#Iremos substituir os valores nan no dataset por um valor coringa, que irá corresponder ao valor máximo do dataset + 1
valor_coringa = np.nanmax(dados) + 1 #numero maximo dentro de "dados" ignorando o nan
print(valor_coringa) 

68616520.0


In [6]:
#Usaremos agora uma técnica para saber quais colunas têm valores numéricos e quais são do tipo string
media_ignora_nan = np.nanmean(dados,axis = 0)
media_ignora_nan

array([54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,      440.92,         nan,         nan,         nan,         nan,         nan,     3143.85])

In [7]:
# A NaN entry in media_ignora_nan marks a column that is treated as a string column.
# np.flatnonzero always returns a 1-D index array, even when a single column
# matches (argwhere(...).squeeze() would collapse that case to a 0-d scalar).
colunas_string = np.flatnonzero(np.isnan(media_ignora_nan))
colunas_string

array([ 1,  3,  5,  8,  9, 10, 11, 12], dtype=int64)

In [8]:
# A non-NaN entry in media_ignora_nan marks a column that is treated as numeric.
# `~mask` is the idiomatic negation of a boolean array, and np.flatnonzero keeps
# the result 1-D even when only one column qualifies.
colunas_num = np.flatnonzero(~np.isnan(media_ignora_nan))
colunas_num

array([ 0,  2,  4,  6,  7, 13], dtype=int64)

In [9]:
# Re-read the file keeping only the text columns, loaded as strings
array_string = np.genfromtxt(
    'dataset1.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
    usecols=colunas_string,
    dtype=str,
    encoding='cp1252',
)
array_string

array([['May-15', 'Current', '36 months', ..., 'Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']], dtype='<U69')

In [10]:
# Re-read the file keeping only the numeric columns; missing entries are
# filled with the placeholder value (valor_coringa)
array_num = np.genfromtxt(
    'dataset1.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
    usecols=colunas_num,
    filling_values=valor_coringa,
    encoding='cp1252',
)
array_num

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  , 68616520.  ,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  , 68616520.  ,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  , 68616520.  , 68616520.  ,     2185.64],
       [46154151.  , 68616520.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  , 68616520.  ,      309.97,      301.9 ]])

In [11]:
# Load the column names: only the first line of the file is needed, so stop
# after one row instead of parsing every record and discarding it via skip_footer.
array_nomes_colunas = np.genfromtxt(
    'dataset1.csv',
    delimiter=';',
    max_rows=1,
    dtype=str,
    encoding='cp1252',
    autostrip=True,
)
array_nomes_colunas

array(['id', 'issue_d', 'loan_amnt', 'loan_status', 'funded_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'verification_status', 'url', 'addr_state', 'total_pymnt'], dtype='<U19')

In [12]:
#Nomes das colunas numéricas e strings
header_num, header_str = array_nomes_colunas[colunas_num],array_nomes_colunas[colunas_string]

In [13]:
header_num

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

In [14]:
header_str

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url', 'addr_state'], dtype='<U19')

In [15]:
# Checkpoint helper used to persist intermediate results
def checkpoint(file_name, header, checkpoint_data):
    """Save a header/data pair to an ``.npz`` archive and load it back.

    Parameters
    ----------
    file_name : str or path-like
        Target file, with or without the ``.npz`` extension.
    header : array-like
        Column names, stored under the key ``'header'``.
    checkpoint_data : array-like
        Data array, stored under the key ``'data'``.

    Returns
    -------
    The object returned by ``np.load`` for the written archive; its entries
    are read with ``['header']`` and ``['data']``.
    """
    # np.savez appends ".npz" only when the name lacks it. Resolve the final
    # path once so saving and loading always agree (previously a name already
    # ending in ".npz" was loaded back as "<name>.npz.npz" and failed).
    npz_path = str(file_name)
    if not npz_path.endswith(".npz"):
        npz_path += ".npz"
    np.savez(npz_path, header=header, data=checkpoint_data)
    return np.load(npz_path)

In [16]:
checkpoint_inicial = checkpoint("checkpoint-inicial",header_str,array_string)

In [17]:
checkpoint_inicial['data']

array([['May-15', 'Current', '36 months', ..., 'Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']], dtype='<U69')

In [18]:
#Verificando se o array do checkpoint está igual ao array inicial
np.array_equal(checkpoint_inicial['data'],array_string)

True

## Tratamento das colunas str

In [19]:
header_str[0] = 'issue_date'

In [20]:
header_str

array(['issue_date', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url', 'addr_state'], dtype='<U19')

In [21]:
array_string

array([['May-15', 'Current', '36 months', ..., 'Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']], dtype='<U69')

### Variável issue_date

In [22]:
np.unique(array_string[:,0])

array(['', 'Apr-15', 'Aug-15', 'Dec-15', 'Feb-15', 'Jan-15', 'Jul-15', 'Jun-15', 'Mar-15', 'May-15', 'Nov-15', 'Oct-15', 'Sep-15'], dtype='<U69')

In [23]:
# Remove the "-15" year suffix.
# strip("-15") treats its argument as a set of characters ('-', '1', '5') to
# trim from both ends, not as a literal suffix; replace removes the exact text.
array_string[:,0] = np.char.replace(array_string[:,0], "-15", "")

In [24]:
np.unique(array_string[:,0])

array(['', 'Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep'], dtype='<U69')

In [25]:
meses = np.array(['','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

In [26]:
# Replace each month label by its position in `meses` (the empty label sits at index 0)
for numero_mes, nome_mes in enumerate(meses):
    array_string[array_string[:,0] == nome_mes, 0] = numero_mes
    

In [27]:
np.unique(array_string[:,0])

array(['0', '1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U69')

### Variável loan_status

In [28]:
np.unique(array_string[:,1]).size

9

In [29]:
np.unique(array_string[:,1])

array(['', 'Charged Off', 'Current', 'Default', 'Fully Paid', 'In Grace Period', 'Issued', 'Late (16-30 days)', 'Late (31-120 days)'], dtype='<U69')

In [30]:
# Statuses that will be encoded as "bad" loans (four entries, including the empty/missing status)
# NOTE(review): 'Charged Off' is not listed here while 'Issued' is — confirm this is the intended classification
status_bad = np.array(['','Issued','Late (31-120 days)','Default'])

In [31]:
# Statuses listed in status_bad become "1"; every other status becomes "0"
array_string[:,1] = np.isin(array_string[:,1], status_bad).astype(int)

In [32]:
np.unique(array_string[:,1])

array(['0', '1'], dtype='<U69')

### Variável term

In [33]:
np.unique(array_string[:,2])

array(['', '36 months', '60 months'], dtype='<U69')

In [34]:
# Drop the " months" unit so only the number of months remains.
# strip(" months") trims any of the characters ' ', 'm', 'o', 'n', 't', 'h', 's'
# from both ends rather than the literal text; replace removes exactly " months".
header_str[2] = 'term_month'
array_string[:,2] = np.char.replace(array_string[:,2], " months", "")
array_string[:,2]

array(['36', '36', '36', ..., '36', '36', '36'], dtype='<U69')

In [35]:
# Missing terms are filled with the largest term (60 months)
array_string[array_string[:,2] == "", 2] = 60

In [36]:
np.unique(array_string[:,2])

array(['36', '60'], dtype='<U69')

### Variáveis grade e sub_grade

In [37]:
np.unique(array_string[:,3])

array(['', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='<U69')

In [38]:
np.unique(array_string[:,4])

array(['', 'A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1',
       'G2', 'G3', 'G4', 'G5'], dtype='<U69')

In [39]:
np.unique(array_string[:,3])[1:]

array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='<U69')

In [40]:
# Where sub_grade is empty but grade is known, assign sub-grade 5 of that grade
for grade in np.unique(array_string[:,3])[1:]:
    sem_subgrade = (array_string[:,4] == "") & (array_string[:,3] == grade)
    array_string[sem_subgrade, 4] = grade + '5'

In [41]:
np.unique(array_string[:,4],return_counts = True)

(array(['', 'A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1',
        'G2', 'G3', 'G4', 'G5'], dtype='<U69'),
 array([  9, 285, 278, 239, 323, 592, 509, 517, 530, 553, 633, 629, 567, 586, 564, 577, 391, 267, 250, 255, 288, 235, 162, 171, 139, 160,  94,  52,  34,  43,  24,  19,  10,   3,   7,   5], dtype=int64))

In [42]:
# Rows that still have an empty sub_grade are placed in a new category, 'H1'
array_string[array_string[:,4] == '', 4] = 'H1'

In [43]:
np.unique(array_string[:,4])

array(['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2',
       'G3', 'G4', 'G5', 'H1'], dtype='<U69')

In [44]:
#Removendo a variável grade
array_string = np.delete(array_string,3,axis = 1)

In [45]:
#Nova variável de índice 3
array_string[:,3]

array(['C3', 'A5', 'B5', ..., 'A5', 'D2', 'A4'], dtype='<U69')

In [46]:
#Removendo a coluna do array de colunas
header_str = np.delete(header_str,3)

In [47]:
np.unique(array_string[:,3])

array(['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2',
       'G3', 'G4', 'G5', 'H1'], dtype='<U69')

In [48]:
#Lista de chaves
keys = list(np.unique(array_string[:,3]))
keys[0]

'A1'

In [49]:
#Lista de valores
values = list(range(1,np.unique(array_string[:,3]).shape[0] + 1))
values[0]

1

In [50]:
dict_subgrade = dict(zip(keys,values))

In [51]:
dict_subgrade

{'A1': 1,
 'A2': 2,
 'A3': 3,
 'A4': 4,
 'A5': 5,
 'B1': 6,
 'B2': 7,
 'B3': 8,
 'B4': 9,
 'B5': 10,
 'C1': 11,
 'C2': 12,
 'C3': 13,
 'C4': 14,
 'C5': 15,
 'D1': 16,
 'D2': 17,
 'D3': 18,
 'D4': 19,
 'D5': 20,
 'E1': 21,
 'E2': 22,
 'E3': 23,
 'E4': 24,
 'E5': 25,
 'F1': 26,
 'F2': 27,
 'F3': 28,
 'F4': 29,
 'F5': 30,
 'G1': 31,
 'G2': 32,
 'G3': 33,
 'G4': 34,
 'G5': 35,
 'H1': 36}

In [52]:
# Replace every sub_grade label by its numeric code from dict_subgrade
for rotulo in np.unique(array_string[:,3]):
    array_string[array_string[:,3] == rotulo, 3] = dict_subgrade[rotulo]

In [53]:
np.unique(array_string[:,3])

array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '4', '5', '6',
       '7', '8', '9'], dtype='<U69')

In [54]:
header_str

array(['issue_date', 'loan_status', 'term_month', 'sub_grade', 'verification_status', 'url', 'addr_state'], dtype='<U19')

### Variável verification_status

In [55]:
np.unique(array_string[:,4])

array(['', 'Not Verified', 'Source Verified', 'Verified'], dtype='<U69')

In [56]:
# Empty and 'Not Verified' become 0; every other verification status becomes 1
array_string[:,4] = (~np.isin(array_string[:,4], ['', 'Not Verified'])).astype(int)

In [57]:
np.unique(array_string[:,4])

array(['0', '1'], dtype='<U69')

### Variável url

In [58]:
array_string[:,5]

array(['https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261',
       'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', ..., 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990',
       'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249'], dtype='<U69')

In [59]:
# Extract the id at the end of each url.
# strip() would treat the URL text as a set of characters to trim from both
# ends; replace removes the literal prefix and leaves the id untouched.
array_string[:,5] = np.char.replace(array_string[:,5], "https://www.lendingclub.com/browse/loanDetail.action?loan_id=", "")

In [60]:
#Transformando em inteiros
array_string[:,5].astype(dtype = np.int32)

array([48010226, 57693261, 59432726, ..., 50415990, 46154151, 66055249])

In [61]:
#Esse id já está no array_num, portanto vamos remover essa coluna
array_string = np.delete(array_string,5,axis = 1)

In [62]:
header_str = np.delete(header_str,5)

### Variável addr_state

In [63]:
#Removendo a variável
array_string = np.delete(array_string,5,axis = 1)

In [64]:
header_str = np.delete(header_str,5)

In [65]:
array_string.shape

(10000, 5)

In [66]:
header_str

array(['issue_date', 'loan_status', 'term_month', 'sub_grade', 'verification_status'], dtype='<U19')

In [67]:
#Convertendo o array de strings para o tipo numérico
array_string = array_string.astype(int)

In [68]:
array_string.dtype

dtype('int32')

## Segunda função checkpoint

In [69]:
checkpoint_strings = checkpoint('checkpoint_strings',header_str,array_string)

## Tratamento das colunas numéricas

In [70]:
array_num

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  , 68616520.  ,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  , 68616520.  ,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  , 68616520.  , 68616520.  ,     2185.64],
       [46154151.  , 68616520.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  , 68616520.  ,      309.97,      301.9 ]])

In [71]:
header_num

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

In [72]:
#Quando carregamos os dados, os valores ausentes foram substituídos por um valor coringa
valor_coringa

68616520.0

In [73]:
#Criando um array com o mínimo, média e máximo ignorando nan
array_stats = np.array([np.nanmin(dados,axis = 0),media_ignora_nan,np.nanmax(dados,axis = 0)])

In [74]:
array_stats[:,colunas_num]

array([[  373332.  ,     1000.  ,     1000.  ,        6.  ,       31.42,        0.  ],
       [54015809.19,    15273.46,    15311.04,       16.62,      440.92,     3143.85],
       [68616519.  ,    35000.  ,    35000.  ,       28.99,     1372.97,    41913.62]])

### Variável funded_amnt

In [75]:
array_num[:,2]

array([35000., 30000., 15000., ..., 10000., 10000., 10000.])

In [76]:
# Placeholder entries in column 2 are replaced by the value in row 0 of array_stats for that column
array_num[array_num[:,2] == valor_coringa, 2] = array_stats[0, colunas_num[2]]

In [77]:
array_num[:,2]

array([35000., 30000., 15000., ..., 10000., 10000., 10000.])

### Variáveis loan_amnt, int_rate, installment, total_pymnt

In [78]:
# Placeholder entries in columns 1, 3, 4 and 5 are replaced by the value in row 2 of array_stats for that column
for col in (1, 3, 4, 5):
    array_num[array_num[:,col] == valor_coringa, col] = array_stats[2, colunas_num[col]]

In [79]:
array_num

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  ,       28.99,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  ,       28.99,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  ,       28.99,     1372.97,     2185.64],
       [46154151.  ,    35000.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  ,       28.99,      309.97,      301.9 ]])

## Dataset de cotação USD - EURO

In [80]:
dados_cot = np.genfromtxt("dataset2.csv", 
                          delimiter = ',', 
                          autostrip = True, 
                          skip_header = 1, 
                          usecols = 3)

In [81]:
dados_cot

array([1.13, 1.12, 1.08, 1.11, 1.1 , 1.12, 1.09, 1.13, 1.13, 1.1 , 1.06, 1.09])

In [82]:
header_str

array(['issue_date', 'loan_status', 'term_month', 'sub_grade', 'verification_status'], dtype='<U19')

In [84]:
array_string

array([[ 5,  0, 36, 13,  1],
       [ 0,  0, 36,  5,  1],
       [ 9,  0, 36, 10,  1],
       ...,
       [ 6,  0, 36,  5,  1],
       [ 4,  0, 36, 17,  1],
       [12,  0, 36,  4,  0]])

In [85]:
#A coluna 0 do array de strings é o mês
exchange = array_string[:,0]

In [87]:
# Map month numbers 1..12 to dados_cot[month - 1] with a single table lookup.
# Slot 0 keeps the value 0 so rows with month 0 stay marked as 0.
# The previous 12-pass loop compared already-substituted rates against later
# month numbers, so a rate equal to an integer would have been replaced again.
taxa_por_mes = np.concatenate(([0.0], dados_cot))
exchange = taxa_por_mes[array_string[:,0]]

In [89]:
#se tiver nulos, substituimos pela media
exchange = np.where(exchange == 0,np.mean(dados_cot),exchange)

In [90]:
exchange

array([1.1 , 1.11, 1.13, ..., 1.12, 1.11, 1.09])

In [92]:
#Esse array será adicionado ao array de numeros. Vamos ver se os shapes correspondem
exchange.shape

(10000,)

In [93]:
array_num.shape

(10000, 6)

In [95]:
# Turn exchange into a single column; -1 lets NumPy infer the number of rows
# instead of hard-coding 10000
exchange = exchange.reshape(-1, 1)

In [96]:
#Juntando os arrays
array_num = np.hstack((array_num,exchange))

In [97]:
#Acrescentando o cabeçalho
header_num = np.concatenate((header_num,np.array(['exchange'])))

In [98]:
colunas_dollar = np.array([1,2,4,5])

In [99]:
array_num[:,6]

array([1.1 , 1.11, 1.13, ..., 1.12, 1.11, 1.09])

In [101]:
# Append one new column per entry of colunas_dollar, each divided by column 6.
# array_num[:, [6]] keeps column 6 two-dimensional so it broadcasts across the
# selected columns; no hard-coded row count and a single hstack.
array_num = np.hstack((array_num, array_num[:, colunas_dollar] / array_num[:, [6]]))

In [102]:
header_add = np.array([nome_coluna + '_EUR' for nome_coluna in header_num[colunas_dollar]])

In [103]:
header_num = np.concatenate((header_num,header_add))

In [104]:
header_num [colunas_dollar] = np.array([nome_coluna+'_USD' for nome_coluna in header_num[colunas_dollar]])

In [105]:
columns_index_order = [0,1,7,2,8,3,4,9,5,10,6]

In [106]:
header_num = header_num[columns_index_order]

In [107]:
array_num = array_num[:,columns_index_order]

### Variável int_rate

In [109]:
array_num[:,5] = array_num[:,5]/100

In [110]:
array_num[:,5]

array([0.13, 0.29, 0.29, ..., 0.29, 0.17, 0.29])

## Construindo o dataset final

In [112]:
df_final = np.hstack((array_num,array_string))

In [113]:
df_final

array([[48010226.  ,    35000.  ,    31933.3 , ...,       36.  ,       13.  ,        1.  ],
       [57693261.  ,    30000.  ,    27132.46, ...,       36.  ,        5.  ,        1.  ],
       [59432726.  ,    15000.  ,    13326.3 , ...,       36.  ,       10.  ,        1.  ],
       ...,
       [50415990.  ,    10000.  ,     8910.3 , ...,       36.  ,        5.  ,        1.  ],
       [46154151.  ,    35000.  ,    31490.9 , ...,       36.  ,       17.  ,        1.  ],
       [66055249.  ,    10000.  ,     9145.8 , ...,       36.  ,        4.  ,        0.  ]])

In [116]:
header_full = np.concatenate((header_num,header_str))

In [117]:
df_final = np.vstack((header_full,df_final))

In [118]:
#Salvando o dataset final
np.savetxt('dataset_final_processado.csv',df_final,fmt = '%s',delimiter = ',')