In [469]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics


import warnings
warnings.filterwarnings("ignore")

In [470]:
df = pd.read_csv('base_vigente_anos_anteriores.csv')
df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [471]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 171 entries, class to eg_000
dtypes: int64(1), object(170)
memory usage: 78.3+ MB


In [472]:
df['class'].value_counts()

neg    59000
pos     1000
Name: class, dtype: int64

In [473]:
# Balanceando as classes
df1 = df[df['class'] == 'pos']
df2 = df[df['class'] == 'neg'].sample(1300)
df = df1.append(df2)

In [474]:
df['class'].value_counts()

neg    1300
pos    1000
Name: class, dtype: int64

In [475]:
df = df.replace('na', np.nan)

for col in df.columns[1:]:
    df[col] = df[col].replace(np.nan, round(df[col][~df[col].isna()].astype('float64').mean(), 3))
    
df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
9,pos,153204,0.0,182.0,952.227,0.0,0.0,0,0,0,...,129862.0,26872,34044.0,22472.0,34362.0,0.0,0,0.0,0.0,0.0
23,pos,453236,1.046,2926.0,952.227,0.0,0.0,0,0,222,...,7908038.0,3026002,5025350.0,2025766.0,1160638.0,533834.0,493800,6914.0,0.0,0.0
60,pos,72504,1.046,1594.0,1052.0,0.0,0.0,0,244,178226,...,1432098.0,372252,527514.0,358274.0,332818.0,284178.0,3742,0.0,0.0,0.0
115,pos,762958,1.046,292451000.0,952.227,7.479,19.645,776,281128,2186308,...,2090330.0,956991,2006460.0,2160900.0,1753080.0,1192890.0,473874,25942.4,0.289,0.721
135,pos,695994,1.046,292451000.0,952.227,7.479,19.645,0,0,0,...,1397742.0,495544,361646.0,28610.0,5130.0,212.0,0,0.0,0.289,0.721


In [476]:
df['class'] = df['class'].replace('neg', 0)
df['class'] = df['class'].replace('pos', 1)

for col in df.columns:
    df[col] = df[col].astype('float64')
    
df['class'] = df['class'].astype(int)

In [477]:
var_zero = VarianceThreshold(threshold=0)
var_zero.fit(df)
col_var_zero = [x for x in df.columns if x not in df.columns[var_zero.get_support()]]
print('Quantidade de variáveis com variância igual a zero:','\n', len(col_var_zero))
print("")
print("Variáveis com variância igual a zero e removidas:","\n", col_var_zero)

# get.support() retorna todas as colunas com variância diferente de zero:

df = df[df.columns[var_zero.get_support()]]
print("")
print('Shape dos dados:', df.shape)

Quantidade de variáveis com variância igual a zero: 
 1

Variáveis com variância igual a zero e removidas: 
 ['cd_000']

Shape dos dados: (2300, 170)


In [478]:
var_zero_ = VarianceThreshold(threshold=0.03)
var_zero_.fit(df)
col_var_zero_ = [x for x in df.columns if x not in df.columns[var_zero_.get_support()]]
print('Quantidade de variáveis com variância perto de zero:','\n', len(col_var_zero_))
print("")
print("Variáveis com variância perto de zero e removidas:","\n", col_var_zero_)

df = df[df.columns[var_zero_.get_support()]]
print('Shape dos dados:', df.shape)

Quantidade de variáveis com variância perto de zero: 
 1

Variáveis com variância perto de zero e removidas: 
 ['ch_000']
Shape dos dados: (2300, 169)


In [479]:
def correlation(dataset, threshold):
    col_corr = set()  
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: 
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

corr_features = correlation(df.drop(['class'], axis=1), 0.9)
print('Variáveis correlacionadas e removidas: ', len(set(corr_features)))
print(corr_features)
df.drop(corr_features, axis=1, inplace=True)
print("")
print('Shape dos dados:', df.shape)

Variáveis correlacionadas e removidas:  41
{'bu_000', 'ci_000', 'cn_002', 'dn_000', 'dt_000', 'ed_000', 'bm_000', 'cn_001', 'cs_005', 'bh_000', 'ao_000', 'bg_000', 'bp_000', 'bb_000', 'bq_000', 'cq_000', 'aq_000', 'an_000', 'ba_004', 'bj_000', 'dc_000', 'by_000', 'ee_004', 'ba_002', 'ax_000', 'cc_000', 'ee_003', 'bt_000', 'ee_001', 'ag_004', 'bo_000', 'am_0', 'cn_003', 'br_000', 'ee_002', 'dp_000', 'bv_000', 'cn_005', 'ba_003', 'ba_005', 'ak_000'}

Shape dos dados: (2300, 128)


In [480]:
# def detect_outlier(data, threshold=3):
#     df_outlier = pd.DataFrame()
    
#     for col in data.columns:
#         mean, std = np.mean(data[col]), np.std(data[col])
#         outliers=[]
        
#         for y in data[col]:
#             z_score = (y - mean)/std
            
#             if np.abs(z_score) > threshold or np.abs(z_score) < -threshold:  
#                 outliers.append(y)
#         df2 = pd.DataFrame({'variável': [col], 'quantidade': [len(set(outliers))]})
#         df_outlier = df_outlier.append(df2)
#         df_outlier = df_outlier.loc[df_outlier.quantidade > 0]  
          
#     return df_outlier

# outliers = detect_outlier(df)
# outliers.head()

In [481]:
df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,eb_000,ec_00,ee_000,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
9,1,153204.0,0.0,182.0,952.227,0.0,0.0,0.0,0.0,0.0,...,0.0,1.9,512878.0,22472.0,34362.0,0.0,0.0,0.0,0.0,0.0
23,1,453236.0,1.046,2926.0,952.227,0.0,0.0,0.0,0.0,222.0,...,432739600.0,5396.14,4079752.0,2025766.0,1160638.0,533834.0,493800.0,6914.0,0.0,0.0
60,1,72504.0,1.046,1594.0,1052.0,0.0,0.0,0.0,244.0,178226.0,...,1550150.0,2365.36,5935440.0,358274.0,332818.0,284178.0,3742.0,0.0,0.0,0.0
115,1,762958.0,1.046,292451100.0,952.227,7.479,19.645,776.0,281128.0,2186308.0,...,37788900.0,5396.14,3981364.518,2160904.771,1753080.835,1192894.542,473873.862,25942.408,0.289,0.721
135,1,695994.0,1.046,292451100.0,952.227,7.479,19.645,0.0,0.0,0.0,...,37788900.0,0.1,1073478.0,28610.0,5130.0,212.0,0.0,0.0,0.289,0.721


In [482]:
# Import dos Módulos
from sklearn.ensemble import ExtraTreesClassifier
import warnings
warnings.filterwarnings("ignore")

# Separando o array em componentes de input e output
X = df.drop(['class'], axis=1).values
Y = df['class'].values
# Criação do Modelo - Feature Selection
modelo = ExtraTreesClassifier()
modelo.fit(X, Y)

X_features = df.drop(['class'], axis=1)

# Lista de colunas a serem usadas para treinar cada modelo
features = [col for col in list(X_features) ]
importances =  modelo.feature_importances_
descending_indices = np.argsort(importances)[::-1]
sorted_importances = [importances[idx] for idx in descending_indices]
sorted_features = [features[idx] for idx in descending_indices]
print('As Variáveis mais importantes são %s' % sorted_features[0:5])

As Variáveis mais importantes são ['ck_000', 'bx_000', 'ah_000', 'cv_000', 'ag_005']


In [483]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X = df.drop(['class'], axis=1).values
Y = df['class'].values

num_folds = 5

modelos = []
resultados = []
nomes = []

modelos.append(('KNN', KNeighborsClassifier()))
modelos.append(('LR', LogisticRegression()))
modelos.append(('Dt', DecisionTreeClassifier()))

for nome, modelo in modelos:
    kfold = KFold(num_folds, True)
    cv_results = cross_val_score(modelo, X, Y, cv = kfold, scoring = 'accuracy')
    resultados.append(cv_results)
    nomes.append(nome)
    msg = "%s: %f %f (%f)" % (nome, cv_results.mean(), cv_results.max(), cv_results.std())
    print(msg)

KNN: 0.943043 0.958696 (0.019657)
LR: 0.940435 0.947826 (0.007091)
Dt: 0.926522 0.936957 (0.008630)


In [484]:
# Otimização do valor de K

from sklearn.model_selection import train_test_split

# Divisão em dados de treino e de teste
X_treino, testeData, Y_treino, testeLabels = train_test_split(X, Y, test_size = 0.25)

# Range de valores de k que iremos testar
kVals = range(3, 25, 2)

# Lista vazia para receber as acurácias
acuracias = []

# Loop em todos os valores de k para testar cada um deles
for k in kVals:
    
    # Treinando o modelo KNN com cada valor de k
    modeloKNN = KNeighborsClassifier(n_neighbors = k)
    modeloKNN.fit(X_treino, Y_treino)
          
    # Avaliando o modelo e atualizando a lista de acurácias
    score = modeloKNN.score(testeData, testeLabels)
    print("Com valor de k = %d, a acurácia é = %.2f%%" % (k, score * 100))
    acuracias.append(score)

Com valor de k = 3, a acurácia é = 92.87%
Com valor de k = 5, a acurácia é = 93.22%
Com valor de k = 7, a acurácia é = 93.22%
Com valor de k = 9, a acurácia é = 93.22%
Com valor de k = 11, a acurácia é = 93.22%
Com valor de k = 13, a acurácia é = 93.57%
Com valor de k = 15, a acurácia é = 93.57%
Com valor de k = 17, a acurácia é = 92.87%
Com valor de k = 19, a acurácia é = 92.87%
Com valor de k = 21, a acurácia é = 92.87%
Com valor de k = 23, a acurácia é = 92.70%


In [485]:
modeloKNN = KNeighborsClassifier(n_neighbors = 11)
modeloKNN.fit(X_treino, Y_treino)

ypred = modeloKNN.predict(testeData)

In [486]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ConfusionMatrix = confusion_matrix(testeLabels, ypred, )
print("Confusion Matrix:")
print(ConfusionMatrix, '\n')

ClassificationReport = classification_report(testeLabels, ypred)
print("Classification Report:",)
print (ClassificationReport, '\n')

accuracy = accuracy_score(testeLabels, ypred)
print("Accuracy:", accuracy)

Confusion Matrix:
[[314   8]
 [ 31 222]] 

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       322
           1       0.97      0.88      0.92       253

    accuracy                           0.93       575
   macro avg       0.94      0.93      0.93       575
weighted avg       0.93      0.93      0.93       575
 

Accuracy: 0.9321739130434783


In [487]:
df2 = pd.read_csv('base_vigente_2020.csv')
df2 = df2[list(df.columns)]

In [488]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2300 entries, 9 to 54796
Columns: 128 entries, class to eg_000
dtypes: float64(127), int32(1)
memory usage: 2.3 MB


In [489]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Columns: 128 entries, class to eg_000
dtypes: int64(1), object(127)
memory usage: 15.6+ MB


In [490]:
df2['class'].value_counts()

neg    15625
pos      375
Name: class, dtype: int64

In [491]:
df2['class'] = df2['class'].replace('neg', 0)
df2['class'] = df2['class'].replace('pos', 1)

In [492]:
df2 = df2.replace('na', np.nan)

for col in df2.columns[1:]:
    df2[col] = df2[col].replace(np.nan, round(df2[col][~df2[col].isna()].astype('float64').mean(), 3))

In [493]:
X2 = df2.drop(['class'], axis=1).values

ypred = modeloKNN.predict(X2)
df2['predict'] = ypred

df2.predict.value_counts()

0    14904
1     1096
Name: predict, dtype: int64

In [494]:
df2['class'].value_counts()

0    15625
1      375
Name: class, dtype: int64

In [495]:
df2[['class', 'predict']][df2[['class', 'predict']]['class']==1].predict.value_counts()

1    351
0     24
Name: predict, dtype: int64

In [496]:
df2[['class', 'predict']][df2[['class', 'predict']]['class']==0].predict.value_counts()

0    14880
1      745
Name: predict, dtype: int64

In [497]:
ConfusionMatrix = confusion_matrix(df2['class'].values, df2.predict.values,)
print("Confusion Matrix:")
print(ConfusionMatrix, '\n')

ClassificationReport = classification_report(df2['class'].values, df2.predict.values)
print("Classification Report:",)
print (ClassificationReport, '\n')

Confusion Matrix:
[[14880   745]
 [   24   351]] 

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     15625
           1       0.32      0.94      0.48       375

    accuracy                           0.95     16000
   macro avg       0.66      0.94      0.73     16000
weighted avg       0.98      0.95      0.96     16000
 



In [498]:
round(df2[df2['class'] == 0][['ck_000', 'bx_000', 'ah_000', 'cv_000', 'ag_005']].astype('float64').describe(), 0)

Unnamed: 0,ck_000,bx_000,ah_000,cv_000,ag_005
count,15625.0,15625.0,15625.0,15625.0,15625.0
mean,571550.0,3324478.0,1512760.0,1821463.0,889129.0
std,1671547.0,6910208.0,3203531.0,2608872.0,2308701.0
min,0.0,170.0,0.0,0.0,0.0
25%,14835.0,100490.0,31102.0,69758.0,14338.0
50%,246573.0,2393396.0,999984.0,1936070.0,173340.0
75%,531679.0,3943864.0,1565544.0,2170520.0,891568.0
max,41701121.0,113263092.0,45751370.0,55325344.0,40479252.0


In [499]:
round(df2[df2['class'] == 1][['ck_000', 'bx_000', 'ah_000', 'cv_000', 'ag_005']].astype('float64').describe(), 0)

Unnamed: 0,ck_000,bx_000,ah_000,cv_000,ag_005
count,375.0,375.0,375.0,375.0,375.0
mean,8520583.0,45171932.0,18791452.0,6711357.0,13122767.0
std,6936716.0,41577258.0,12540213.0,9949666.0,13860270.0
min,0.0,204.0,0.0,0.0,0.0
25%,4210270.0,16120902.0,8813818.0,1936070.0,4389609.0
50%,7018198.0,39936198.0,17476970.0,1936070.0,10280346.0
75%,10369141.0,66984928.0,26985229.0,6988510.0,17443843.0
max,52801907.0,531835592.0,82073576.0,67080280.0,168534742.0
