<a href="https://colab.research.google.com/github/Willian-BC/Projeto-Final/blob/main/Projeto_DataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Configurações Iniciais e download

In [None]:
!pip install wget

In [3]:
import pandas as pd
import numpy as np
import wget
from zipfile import ZipFile
from google.colab.data_table import DataTable
import plotly.graph_objects as go
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from subprocess import check_output

In [4]:
url_base = 'http://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/DFP/DADOS/'
arquivos_zip = []
for ano in range(2010, 2021, 10):
  arquivos_zip.append(f'dfp_cia_aberta_{ano}.zip')

for arq in arquivos_zip:
  wget.download(url_base+arq)
  ZipFile(arq, 'r').extractall('CVM')

arquivos_zip

['dfp_cia_aberta_2010.zip', 'dfp_cia_aberta_2020.zip']

#Seleção dos dados

In [None]:
dre_10 = pd.read_csv(f'CVM/dfp_cia_aberta_DRE_ind_2010.csv',sep=';',decimal=',',encoding='ISO-8859-1')
dre_10['DS_CONTA'] = dre_10['DS_CONTA'].str.lower()
dre_10.VL_CONTA = dre_10.VL_CONTA.astype(float)
dre_10 = dre_10.pivot_table(index=['CD_CVM','CNPJ_CIA','DENOM_CIA','MOEDA','ESCALA_MOEDA','DT_INI_EXERC','DT_FIM_EXERC'], columns=['DS_CONTA'], values='VL_CONTA').reset_index().rename_axis(None, axis=1)#.fillna(0)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_rows', 99999)
dre_10.count()

In [None]:
#ENCONTRAR DS_CONTA PARA USAR NO MODELO
dre_10 = pd.read_csv(f'CVM/dfp_cia_aberta_DRE_ind_2010.csv',sep=';',decimal=',',encoding='ISO-8859-1')
dre_10['DS_CONTA'] = dre_10['DS_CONTA'].str.lower()
dre_10.VL_CONTA = dre_10.VL_CONTA.astype(float)
df = dre_10.loc[dre_10['DS_CONTA'].str.contains("lucro", case=False)]
DataTable(df[['DS_CONTA','VL_CONTA']].drop_duplicates(subset=['DS_CONTA'], keep='first'))
#SCRIPT APENAS PARA APOIO

#Tratamento DataFrame

In [None]:
dre_10 = pd.read_csv(f'CVM/dfp_cia_aberta_DRE_ind_2010.csv',sep=';',decimal=',',encoding='ISO-8859-1')
dre_10 = dre_10[dre_10.ORDEM_EXERC == 'ÚLTIMO']
dre_10 = dre_10[dre_10.DS_CONTA.isin(['Corrente', 'Custo dos Bens e/ou Serviços Vendidos', 'Despesas Financeiras', 'Despesas Gerais e Administrativas',
                                      'Despesas com Vendas','Despesas/Receitas Operacionais', 'Diferido', 'Lucro/Prejuízo do Período','Outras Despesas Operacionais',
                                      'Imposto de Renda e Contribuição Social sobre o Lucro', 'Outras Receitas Operacionais', 'Receitas Financeiras',
                                      'Perdas pela Não Recuperabilidade de Ativos', 'Receita de Venda de Bens e/ou Serviços', 'Resultado Bruto', 
                                      'Resultado Antes do Resultado Financeiro e dos Tributos','Resultado Antes dos Tributos sobre o Lucro', 'Resultado Financeiro',
                                      'Resultado Líquido das Operações Continuadas'])]
                                      
dre_10 = dre_10[['CD_CVM','CNPJ_CIA','DENOM_CIA','MOEDA','ESCALA_MOEDA','DT_INI_EXERC','DT_FIM_EXERC','DS_CONTA','VL_CONTA']] 
dre_10.VL_CONTA = dre_10.VL_CONTA.astype(float)
dre_10 = dre_10.pivot_table(index=['CD_CVM','CNPJ_CIA','DENOM_CIA','MOEDA','ESCALA_MOEDA','DT_INI_EXERC','DT_FIM_EXERC'], columns=['DS_CONTA'], values='VL_CONTA').reset_index().rename_axis(None, axis=1).fillna(0)

dre_20 = pd.read_csv(f'CVM/dfp_cia_aberta_DRE_ind_2020.csv',sep=';',decimal=',',encoding='ISO-8859-1')
dre_20 = dre_20[dre_20.ORDEM_EXERC == 'ÚLTIMO']

dre_10['Resultado'] = dre_10.CNPJ_CIA.isin(dre_20.CNPJ_CIA).replace({True: 'Não Faliu', False: 'Faliu'})
dre_10.describe()

In [None]:
DataTable.max_columns = 30
DataTable(dre_10)

In [None]:
color_list = ['red' if i=='Faliu' else 'green' for i in dre_10.loc[:,'Resultado']]
pd.plotting.scatter_matrix(dre_10.loc[:, dre_10.columns != 'Resultado'],
                                       c=color_list,
                                       figsize= [25,25],
                                       diagonal='hist',
                                       alpha=0.5,
                                       s = 200,
                                       marker = '*',
                                       edgecolor= "black")
plt.show()

In [None]:
sns.countplot(x="Resultado", data=dre_10)
dre_10.loc[:,'Resultado'].value_counts()

# Reference
https://www.kaggle.com/kanncaa1/machine-learning-tutorial-for-beginners

#KNN

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)
x,y = dre_10.loc[:,['Corrente','Lucro/Prejuízo do Período']], dre_10.loc[:,'Resultado']
knn.fit(x,y)
prediction = knn.predict(x)
print('Prediction: {}'.format(prediction))

In [None]:
# train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)
knn = KNeighborsClassifier(n_neighbors = 3)
x,y = dre_10.loc[:,dre_10.columns != 'Resultado'], dre_10.loc[:,'Resultado']
knn.fit(x_train,y_train)
prediction = knn.predict(x_test)
#print('Prediction: {}'.format(prediction))
print('With KNN (K=3) accuracy is: ',knn.score(x_test,y_test)) # accuracy

In [None]:
# Model complexity
neig = np.arange(1, 25)
train_accuracy = []
test_accuracy = []
# Loop over different values of k
for i, k in enumerate(neig):
    # k from 1 to 25(exclude)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train,y_train)
    train_accuracy.append(knn.score(x_train, y_train))
    test_accuracy.append(knn.score(x_test, y_test))

# Plot
plt.figure(figsize=[13,8])
plt.plot(neig, test_accuracy, label = 'Testing Accuracy')
plt.plot(neig, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.title('-value VS Accuracy')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.xticks(neig)
plt.savefig('graph.png')
plt.show()
print("Best accuracy is {} with K = {}".format(np.max(test_accuracy),1+test_accuracy.index(np.max(test_accuracy))))

#Regressão Linear

In [None]:
df = dre_10[dre_10['Resultado'] =='Faliu']
x = np.array(df.loc[:,'Corrente']).reshape(-1,1)
y = np.array(df.loc[:,'Lucro/Prejuízo do Período']).reshape(-1,1)
# Scatter
plt.figure(figsize=[10,10])
plt.scatter(x=x,y=y)
plt.xlabel('Corrente')
plt.ylabel('Lucro/Prejuízo do Período')
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
predict_space = np.linspace(min(x), max(x)).reshape(-1,1)
reg.fit(x,y)
predicted = reg.predict(predict_space)
print('R^2 score: ',reg.score(x, y))
# Plot regression line and scatter
plt.plot(predict_space, predicted, color='black', linewidth=3)
plt.scatter(x=x,y=y)
plt.xlabel('Resultado Líquido das Operações Continuadas')
plt.ylabel('Lucro/Prejuízo do Período')
plt.show()

#Validação

In [None]:
# Cross Validation
from sklearn.model_selection import cross_val_score
reg = LinearRegression()
k = 5
cv_result = cross_val_score(reg,x,y,cv=k) # uses R^2 as score 
print('CV Scores: ',cv_result)
print('CV scores average: ',np.sum(cv_result)/k)

In [None]:
# Ridge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 2, test_size = 0.3)
ridge = Ridge(alpha = 0.1, normalize = True)
ridge.fit(x_train,y_train)
ridge_predict = ridge.predict(x_test)
print('Ridge score: ',ridge.score(x_test,y_test))

In [None]:
# Lasso
from sklearn.linear_model import Lasso
x = np.array(df.loc[:,['Corrente','Lucro/Prejuízo do Período']])
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 3, test_size = 0.3)
lasso = Lasso(alpha = 0.1, normalize = True)
lasso.fit(x_train,y_train)
ridge_predict = lasso.predict(x_test)
print('Lasso score: ',lasso.score(x_test,y_test))
print('Lasso coefficients: ',lasso.coef_)

In [None]:
# Confusion matrix with random forest
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
x,y = dre_10.loc[:,['Corrente','Lucro/Prejuízo do Período']], dre_10.loc[:,'Resultado']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)
rf = RandomForestClassifier(random_state = 4)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
print('Confusion matrix: \n',cm)
print('Classification report: \n',classification_report(y_test,y_pred))

In [None]:
sns.heatmap(cm,annot=True,fmt="d") 
plt.show()

In [None]:
# ROC Curve with logistic regression
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
# Faliu = 1 and Não Faliu = 0
dre_10['Lucro/Prejuízo do Período'] = [1 if i == 'Faliu' else 0 for i in dre_10.loc[:,'Resultado']]
x,y = dre_10.loc[:,['Corrente', 'Custo dos Bens e/ou Serviços Vendidos', 'Despesas Financeiras', 'Despesas Gerais e Administrativas',
                                      'Despesas com Vendas','Despesas/Receitas Operacionais', 'Diferido', 'Lucro/Prejuízo do Período','Outras Despesas Operacionais',
                                      'Imposto de Renda e Contribuição Social sobre o Lucro', 'Outras Receitas Operacionais', 'Receitas Financeiras',
                                      'Perdas pela Não Recuperabilidade de Ativos', 'Receita de Venda de Bens e/ou Serviços', 'Resultado Bruto', 
                                      'Resultado Antes do Resultado Financeiro e dos Tributos','Resultado Antes dos Tributos sobre o Lucro', 'Resultado Financeiro',
                                      'Resultado Líquido das Operações Continuadas']], dre_10.loc[:,'Lucro/Prejuízo do Período']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)
logreg = LogisticRegression()
logreg.fit(x_train,y_train)
y_pred_prob = logreg.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.show()

In [None]:
# grid search cross validation with 1 hyperparameter
from sklearn.model_selection import GridSearchCV
grid = {'n_neighbors': np.arange(1,50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, grid, cv=3) # GridSearchCV
knn_cv.fit(x,y)# Fit

# Print hyperparameter
print("Tuned hyperparameter k: {}".format(knn_cv.best_params_)) 
print("Best score: {}".format(knn_cv.best_score_))

#Treinar empresas com machine learning

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, precision_recall_curve


y = dre_10.Resultado #variavel dependente
x = dre_10.drop(['Resultado','CD_CVM','CNPJ_CIA','DENOM_CIA','MOEDA','ESCALA_MOEDA','DT_INI_EXERC','DT_FIM_EXERC'], axis=1) #variaveis independentes

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state = 0)

In [9]:
def CMatrix(CM, labels=['Faliu', 'Não faliu']):
  df = pd.DataFrame(data=CM, index=labels, columns=labels)
  df.index.name='TRUE'
  df.columns.name='PREDICTION'
  df.loc['Total'] = df.sum()
  df['Total'] = df.sum(axis=1)
  return df

In [10]:
metrics = pd.DataFrame(index=['Accuracy','Precision','Recall'],
                       columns=['LogisticReg','ClassTree','NaiveBayes'])

In [None]:
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression(n_jobs=-1, random_state=0)
logistic_regression.fit(x_train, y_train)
y_pred_test = logistic_regression.predict(x_test)
metrics.loc['Accuracy','LogisticReg'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['Precision','LogisticReg'] = precision_score(y_pred=y_pred_test, y_true=y_test, pos_label='Faliu')
metrics.loc['Recall','LogisticReg'] = recall_score(y_pred=y_pred_test, y_true=y_test, pos_label='Faliu')

CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
CMatrix(CM)

In [None]:
from sklearn.tree import DecisionTreeClassifier
class_tree = DecisionTreeClassifier(min_samples_split=30, min_samples_leaf=10, random_state=0)
class_tree.fit(x_train, y_train)
y_pred_test = class_tree.predict(x_test)
metrics.loc['Accuracy','ClassTree'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['Precision','ClassTree'] = precision_score(y_pred=y_pred_test, y_true=y_test, pos_label='Faliu')
metrics.loc['Recall','ClassTree'] = recall_score(y_pred=y_pred_test, y_true=y_test, pos_label='Faliu')

CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
CMatrix(CM)

In [None]:
from sklearn.naive_bayes import GaussianNB
NBC = GaussianNB()
NBC.fit(x_train, y_train)
y_pred_test = NBC.predict(x_test)
metrics.loc['Accuracy','NaiveBayes'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['Precision','NaiveBayes'] = precision_score(y_pred=y_pred_test, y_true=y_test, pos_label='Faliu')
metrics.loc['Recall','NaiveBayes'] = recall_score(y_pred=y_pred_test, y_true=y_test, pos_label='Faliu')

CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
CMatrix(CM)

In [None]:
100*metrics

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
metrics.plot(kind='barh', ax=ax)
ax.grid();

#Comparativo entre modelos

In [None]:
y_test = y_test.map({'Faliu': 1, 'Não Faliu': 0}).astype(int)
precision_nb, recall_nb, thresholds_nb = precision_recall_curve(y_true=y_test, probas_pred=class_tree.predict_proba(x_test)[:,1])
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_true=y_test, probas_pred=logistic_regression.predict_proba(x_test)[:,1])

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(precision_nb, recall_nb, label='ClassTree')
ax.plot(precision_lr, recall_lr, label='LogisticReg')
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
ax.set_title('Precision-Recall Curve')
ax.hlines(y=0.5,xmin=0,xmax=1,color='red')
ax.legend()
ax.grid()

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(thresholds_lr, precision_lr[1:], label='Precision')
ax.plot(thresholds_lr, recall_lr[1:], label='Recall')
ax.set_xlabel('Classification Threshold')
ax.set_ylabel('Precision, Recall')
ax.set_title('Logistic Regression Classifier: Precision-Recall')
#ax.hlines(y=0.6,xmin=0,xmax=1,color='red')
ax.vlines(x=0.4,ymin=0,ymax=1,color='red')
ax.legend()
ax.grid()

In [None]:
y_pred_proba = logistic_regression.predict_proba(x_test)[:,1]
y_pred_test = (y_pred_proba >= 0.4).astype('int')
CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
print("Recall: ", 100*recall_score(y_pred=y_pred_test, y_true=y_test))
print("Precision: ", 100*precision_score(y_pred=y_pred_test, y_true=y_test))
CMatrix(CM)

#individual predictions
https://www.youtube.com/watch?v=zUqa6KcwRhs

In [56]:
def ind_prediction(nova_empresa):
  empresa = nova_empresa.values.reshape(1,-1)
  #empresa = robust_scaler.transform(empresa)
  prob = logistic_regression.predict_proba(empresa)[0][1]
  return prob

In [82]:
dre_20 = pd.read_csv(f'CVM/dfp_cia_aberta_DRE_ind_2020.csv',sep=';',decimal=',',encoding='ISO-8859-1')
dre_20 = dre_20[dre_20.ORDEM_EXERC == 'ÚLTIMO']
dre_20 = dre_20[dre_20.DS_CONTA.isin(['Corrente', 'Custo dos Bens e/ou Serviços Vendidos', 'Despesas Financeiras', 'Despesas Gerais e Administrativas',
                                      'Despesas com Vendas','Despesas/Receitas Operacionais', 'Diferido', 'Lucro/Prejuízo do Período','Outras Despesas Operacionais',
                                      'Imposto de Renda e Contribuição Social sobre o Lucro', 'Outras Receitas Operacionais', 'Receitas Financeiras',
                                      'Perdas pela Não Recuperabilidade de Ativos', 'Receita de Venda de Bens e/ou Serviços', 'Resultado Bruto', 
                                      'Resultado Antes do Resultado Financeiro e dos Tributos','Resultado Antes dos Tributos sobre o Lucro', 'Resultado Financeiro',
                                      'Resultado Líquido das Operações Continuadas'])]
                                      
dre_20 = dre_20[['CD_CVM','CNPJ_CIA','DENOM_CIA','MOEDA','ESCALA_MOEDA','DT_INI_EXERC','DT_FIM_EXERC','DS_CONTA','VL_CONTA']]
dre_20 = dre_20[dre_20['DENOM_CIA'].dropna().str.contains("EM RECUPERAÇÃO JUDICIAL")] 
dre_20.VL_CONTA = dre_20.VL_CONTA.astype(float)
dre_20 = dre_20.pivot_table(index=['CD_CVM','CNPJ_CIA','DENOM_CIA','MOEDA','ESCALA_MOEDA','DT_INI_EXERC','DT_FIM_EXERC'], columns=['DS_CONTA'], values='VL_CONTA').reset_index().rename_axis(None, axis=1).fillna(0)
DataTable(dre_20)



Unnamed: 0,CD_CVM,CNPJ_CIA,DENOM_CIA,MOEDA,ESCALA_MOEDA,DT_INI_EXERC,DT_FIM_EXERC,Corrente,Custo dos Bens e/ou Serviços Vendidos,Despesas Financeiras,Despesas Gerais e Administrativas,Despesas com Vendas,Despesas/Receitas Operacionais,Diferido,Imposto de Renda e Contribuição Social sobre o Lucro,Lucro/Prejuízo do Período,Outras Despesas Operacionais,Outras Receitas Operacionais,Perdas pela Não Recuperabilidade de Ativos,Receita de Venda de Bens e/ou Serviços,Receitas Financeiras,Resultado Antes do Resultado Financeiro e dos Tributos,Resultado Antes dos Tributos sobre o Lucro,Resultado Bruto,Resultado Financeiro,Resultado Líquido das Operações Continuadas
0,1520,60.851.615/0001-53,BARDELLA S.A. INDUSTRIAS MECANICAS EM RECUPERA...,REAL,MIL,2020-01-01,2020-12-31,0.0,-24896.0,-18497.0,-13638.0,-2517.0,-51475.0,10821.0,10821.0,-65455.0,-39025.0,18478.0,0.0,13259.0,5333.0,-63112.0,-76276.0,-11637.0,-13164.0,-65455.0
1,4685,09.116.278/0001-01,CONPEL CIA NORDESTINA PAPEL - EM RECUPERAÇÃO J...,REAL,UNIDADE,2020-01-01,2020-12-31,0.0,-42666.0,-11813.0,-5340.0,-1613.0,-1907.0,644.0,644.0,-12031.0,0.0,5046.0,0.0,43709.0,2.0,-864.0,-12675.0,1043.0,-11811.0,-12031.0
2,5762,61.092.037/0001-81,ETERNIT S.A. - EM RECUPERAÇÃO JUDICIAL,REAL,MIL,2020-01-01,2020-12-31,-8939.0,-385450.0,-10846.0,-45675.0,-37443.0,38059.0,-12117.0,-21056.0,158751.0,49193.0,52061.0,0.0,513874.0,24170.0,166483.0,179807.0,128424.0,13324.0,158751.0
3,6700,33.200.049/0001-47,HOTEIS OTHON S.A. - EM RECUPERAÇÃO JUDICIAL,REAL,MIL,2020-01-01,2020-12-31,0.0,-18778.0,-32811.0,-66127.0,-2832.0,-86986.0,1816.0,1816.0,-90774.0,-4402.0,1724.0,0.0,35782.0,18621.0,-69982.0,-84172.0,17004.0,-14190.0,-82356.0
4,7811,33.035.536/0001-00,JOAO FORTES ENGENHARIA S.A. - EM RECUPERAÇÃO J...,REAL,MIL,2020-01-01,2020-12-31,0.0,2240.0,-25843.0,-18107.0,-42054.0,-149029.0,1302.0,1302.0,-168657.0,-5603.0,14.0,0.0,-5094.0,7767.0,-151883.0,-169959.0,-2854.0,-18076.0,-168657.0
5,10472,60.500.139/0001-26,SARAIVA LIVREIROS S.A. - EM RECUPERAÇÃO JUDICIAL,REAL,MIL,2020-01-01,2020-12-31,0.0,0.0,-1162.0,-3881.0,0.0,-412376.0,0.0,0.0,-419595.0,-1456.0,0.0,0.0,0.0,1456.0,-412376.0,-412082.0,0.0,294.0,-412082.0
6,11207,33.111.246/0001-90,TECNOSOLO ENGENHARIA S.A. - EMPRESA EM RECUPER...,REAL,MIL,2020-01-01,2020-12-31,0.0,0.0,0.0,-3972.0,0.0,-706.0,0.0,0.0,-706.0,2839.0,0.0,0.0,0.0,0.0,-706.0,-706.0,0.0,0.0,-706.0
7,11223,82.636.986/0001-55,TEKA-TECELAGEM KUEHNRICH S.A. - EM RECUPERAÇÃO...,REAL,MIL,2020-01-01,2020-12-31,0.0,-107742.0,-128851.0,-9437.0,-17702.0,-25618.0,2975.0,2975.0,-115783.0,-23714.0,23362.0,0.0,131978.0,11475.0,-1382.0,-118758.0,24236.0,-117376.0,-115783.0
8,11312,76.535.764/0001-43,OI S.A. - EM RECUPERAÇÃO JUDICIAL,REAL,MIL,2020-01-01,2020-12-31,680.0,-2669653.0,-14502225.0,-873266.0,-586690.0,-11490951.0,3736483.0,3737163.0,-10529963.0,-921245.0,2383573.0,0.0,3545254.0,10850449.0,-10615350.0,-14267126.0,875601.0,-3651776.0,-10529963.0
9,11991,84.683.671/0001-94,WETZEL S.A. EM RECUPERAÇÃO JUDICIAL,REAL,MIL,2020-01-01,2020-12-31,-5593.0,-129921.0,-11764.5,-14315.0,-12132.0,30296.0,-3030.0,-8623.0,24873.0,22711.5,11333.0,0.0,146264.0,189.0,46639.0,33496.0,16343.0,-13143.0,24873.0


In [None]:
dre_20 = dre_20.drop(['CD_CVM','CNPJ_CIA','DENOM_CIA','MOEDA','ESCALA_MOEDA','DT_INI_EXERC','DT_FIM_EXERC'], axis=1)

In [77]:
teste_empresa = dre_20.iloc[0]
ind_prediction(teste_empresa)

0.4856613760169589

#FIM

Dados publicos RF (https://github.com/fabioserpa/CNPJ-full)

In [None]:
# Baggin : Bootstrap AGGregating

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset (x_train, x_test, y_train, y_test):
  model = RandomForestRegressor(n_estimators=100, random_state=0)
  model.fit(x_train, y_train)
  preds = model.predict(x_test)
  return mean_absolute_error(y_test, preds) #Quanto menor melhor