## Importações do Projeto

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import *
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import cross_val_score,cross_val_predict,KFold,train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Configurações Iniciais

In [2]:
def standardScalerFunc(data):
  """Fit a ``StandardScaler`` on ``data`` and scale it in one step.

  Returns:
    A ``(scaler, frame)`` tuple: the fitted scaler, and a new
    ``pd.DataFrame`` built directly from the transformed output (no index
    or column labels from ``data`` are passed on to it).
  """
  scaler = StandardScaler()
  scaled_frame = pd.DataFrame(scaler.fit_transform(data))
  return scaler, scaled_frame

def minMaxScalerFunc(data):
  """Fit a ``MinMaxScaler`` on ``data`` and scale it in one step.

  Returns:
    A ``(scaler, frame)`` tuple: the fitted scaler, and a new
    ``pd.DataFrame`` built directly from the transformed output (no index
    or column labels from ``data`` are passed on to it).
  """
  scaler = MinMaxScaler()
  scaled_frame = pd.DataFrame(scaler.fit_transform(data))
  return scaler, scaled_frame

def oneHotEncoderFunc(data):
  """Fit a dense ``OneHotEncoder`` on ``data`` and encode it in one step.

  Categories not seen during fitting are ignored at transform time
  (``handle_unknown='ignore'``).

  Returns:
    A ``(encoder, frame)`` tuple: the fitted encoder, and a new
    ``pd.DataFrame`` built directly from the encoded output (no index or
    column labels from ``data`` are passed on to it).
  """
  try:
    # scikit-learn >= 1.2 spells the dense-output switch ``sparse_output``;
    # the old ``sparse`` keyword was removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
  except TypeError:
    # Older scikit-learn releases only know the ``sparse`` keyword.
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
  ohetransformed = ohe.fit_transform(data)

  return ohe, pd.DataFrame(ohetransformed)

### Pré-processamento dos Dados

In [3]:
def preproc(df):
    """Return a new frame with missing values filled in selected columns.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data. Columns not listed below are
    passed through unchanged.

    Args:
        df: DataFrame containing every column listed in ``fill_values``.

    Returns:
        A new DataFrame with the same columns as ``df``, in the same order.

    Raises:
        KeyError: if one of the listed columns is missing from ``df``.
    """
    fill_values = {
        ## Numeric
        'QT_DIA_SOLICITADO': 0,
        'QT_TEMPO_DOENCA': 0,
        'CD_GUIA_REFERENCIA': 0,
        'DT_NASCIMENTO': 0,
        ## Categoric
        'DS_INDICACAO_ACIDENTE': 'Outros',
        'DS_TIPO_INTERNACAO': '0',
        'DS_TIPO_ACOMODACAO': '0',
        'DS_TIPO_ATENDIMENTO': '0',
        'DS_REGIME_INTERNACAO': '0',
        'DS_UNIDADE_TEMPO_DOENCA': '0',
    }
    # ``assign`` builds a new frame instead of writing into the caller's one.
    filled = {col: df[col].fillna(value) for col, value in fill_values.items()}
    return df.assign(**filled)

### Configuração do Conjunto de Treino

In [4]:
# Load the training CSV and fill the known gaps with preproc().
train_url = '../input/competicao-um-ic/train.csv'

train_data = pd.read_csv(train_url)

train_data = preproc(train_data)


# Columns fed to the min-max scaler.
numeric_classes = ['NR_SEQ_REQUISICAO','NR_SEQ_ITEM','DT_REQUISICAO','DT_NASCIMENTO', 'QT_SOLICITADA', 
                   'QT_TEMPO_DOENCA','QT_DIA_SOLICITADO','CD_ITEM','CD_GUIA_REFERENCIA']

# Columns fed to the one-hot encoder.
categoric_classes = ['DS_REGIME_INTERNACAO','DS_CLASSE','DS_TIPO_ACOMODACAO','DS_TIPO_INTERNACAO', 
                     'DS_UNIDADE_TEMPO_DOENCA','DS_CARATER_ATENDIMENTO', 'DS_TIPO_ATENDIMENTO', 
                     'DS_INDICACAO_ACIDENTE', 'DS_TIPO_PREST_SOLICITANTE', 'DS_TIPO_GUIA', 'DS_TIPO_ITEM',
                     'DS_GRUPO', 'DS_CBO', 'DS_SUBGRUPO']

numeric = train_data[numeric_classes]
categoric = train_data[categoric_classes]

target = train_data['DS_STATUS_ITEM']

# Keep only the rows where every selected feature is present.
X_full = pd.concat([numeric, categoric], axis=1)
X_full = X_full.dropna()

# Restrict the labels to the rows that survived dropna(); otherwise any
# dropped row leaves X and y with different lengths. The index is reset
# because the encoded feature frames below carry a fresh positional index.
y_full = target.loc[X_full.index].reset_index(drop=True)

## Encode X
# NOTE(review): the scaler and encoder are fitted on all rows before the
# hold-out split is made, so hold-out scores may be slightly optimistic.
mm, X_full_mm = minMaxScalerFunc(X_full[numeric_classes])
ohe, X_full_ohe = oneHotEncoderFunc(X_full[categoric_classes])

X_full = pd.concat([X_full_mm, X_full_ohe], axis=1)

## Separação de Dados

In [5]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=0,
)

# Show the (rows, columns) shape of each feature split.
for split_features in (X_train, X_test):
    print(split_features.shape)

(181697, 670)
(45425, 670)


## Configuração e Execução do Naive Bayes

In [6]:
# Fit a multinomial Naive Bayes model on the training split and report its
# metrics on the hold-out split.
clf = MultinomialNB().fit(X_train, y_train)

y_true = y_test
y_pred = clf.predict(X_test)

print(
    "Naive Bayes",
    classification_report(y_true, y_pred),
    accuracy_score(y_test, y_pred),
    "",
    sep="\n",
)

Naive Bayes
              precision    recall  f1-score   support

  Autorizado       0.72      0.75      0.74     30832
      Negado       0.43      0.39      0.41     14593

    accuracy                           0.64     45425
   macro avg       0.58      0.57      0.57     45425
weighted avg       0.63      0.64      0.63     45425

0.6365657677490368



## Configuração e Execução do KNN

In [7]:
# Fit a 5-nearest-neighbours classifier on the training split and report its
# metrics on the hold-out split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_true = y_test
y_pred = knn.predict(X_test)

print(
    "KNN",
    classification_report(y_true, y_pred),
    accuracy_score(y_test, y_pred),
    "",
    sep="\n",
)

KNN
              precision    recall  f1-score   support

  Autorizado       0.83      0.87      0.85     30832
      Negado       0.69      0.62      0.65     14593

    accuracy                           0.79     45425
   macro avg       0.76      0.74      0.75     45425
weighted avg       0.78      0.79      0.79     45425

0.7887947165657677



## Configuração e Execução do Decision Tree Classifier

In [8]:
# Fit a decision tree on the training split and report its metrics on the
# hold-out split. The tree is seeded like the other stochastic steps in this
# notebook (random_state=0) so that a re-run reproduces the same model;
# scikit-learn permutes candidate features at each split, so an unseeded tree
# can differ between runs.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)

y_true, y_pred = y_test, tree.predict(X_test)

print("Decision Tree")
print(classification_report(y_true, y_pred))
print(accuracy_score(y_test, y_pred))
print()

Decision Tree
              precision    recall  f1-score   support

  Autorizado       0.89      0.88      0.89     30832
      Negado       0.76      0.77      0.77     14593

    accuracy                           0.85     45425
   macro avg       0.83      0.83      0.83     45425
weighted avg       0.85      0.85      0.85     45425

0.8480132085855806



## Configuração e Execução do Random Forest Classifier

In [9]:
# Fit a seeded 100-tree random forest on the training split and report its
# metrics on the hold-out split.
random_forest = RandomForestClassifier(random_state=0, n_estimators=100)
random_forest.fit(X_train, y_train)

y_true = y_test
y_pred = random_forest.predict(X_test)

print(
    "Random Forest",
    classification_report(y_true, y_pred),
    accuracy_score(y_test, y_pred),
    "",
    sep="\n",
)

Random Forest
              precision    recall  f1-score   support

  Autorizado       0.86      0.91      0.88     30832
      Negado       0.78      0.69      0.73     14593

    accuracy                           0.84     45425
   macro avg       0.82      0.80      0.81     45425
weighted avg       0.83      0.84      0.83     45425

0.8376664832140892



## Cross Validation

In [10]:
# 10-fold cross-validation of the random forest on the training split only.
k_folds = KFold(n_splits=10)

scores = cross_val_score(
    random_forest,
    X_train,
    y_train,
    cv=k_folds,
)

# Per-fold scores, their mean, and how many folds went into the mean.
for label, value in (
    ("Cross Validation Scores: ", scores),
    ("Average CV Score: ", scores.mean()),
    ("Number of CV Scores used in Average: ", len(scores)),
):
    print(label, value)

Cross Validation Scores:  [0.83324161 0.83247111 0.83802972 0.83296643 0.84144194 0.83412218
 0.83714915 0.8365898  0.83444328 0.83389289]
Average CV Score:  0.8354348097191678
Number of CV Scores used in Average:  10


In [11]:
# y_true, y_pred = y_test, cross_val_predict(random_forest, X_test, y_test, cv=10)

# print("Random Forest CV")
# print(classification_report(y_true, y_pred))
# print(accuracy_score(y_test, y_pred))
# print()

## Configuração do Conjunto de Teste

In [12]:
# Load the competition test CSV and apply the same gap filling as for training.
test_url = '../input/competicao-um-ic/test.csv'

test_data = pd.read_csv(test_url, sep=",", encoding='utf-8',low_memory=False)

test_data = preproc(test_data)

numeric = test_data[numeric_classes]

categoric = test_data[categoric_classes]

# NOTE(review): rows with any remaining missing feature are dropped here, so
# X can have fewer rows than test_data; anything built from the predictions
# must be matched to X.index rather than to the full test_data.
X = pd.concat([numeric, categoric], axis=1)
X = X.dropna()

# Reuse the scaler and encoder fitted on the training data (transform only).
test_mm = mm.transform(X[numeric_classes])
test_ohe = ohe.transform(X[categoric_classes])

# NOTE(review): this rebinds X_test, which until now held the hold-out split;
# re-running the evaluation cells after this one would pair these features
# with the hold-out labels in y_test.
X_test = pd.concat([pd.DataFrame(test_mm), pd.DataFrame(test_ohe)], axis=1)

## Configuração e Treino do Random Forest Classifier para Teste

In [None]:
# Final model: same random forest settings as before, trained on every
# labelled row (X_full / y_full) instead of only the training split.
random_forest_test = RandomForestClassifier(random_state=0, n_estimators=100)

random_forest_test.fit(X=X_full, y=y_full)

## Previsão com os Dados de Teste e Submissão

In [14]:
# Predict with the final model; X_test here is whatever the most recently run
# cell bound to that name (the encoded competition test features when the
# notebook is run top to bottom).
y_pred = random_forest_test.predict(X_test)

In [15]:
# Predictions exist only for the rows kept in X (rows with missing features
# were dropped before encoding), so take the IDs from those same rows.
# Pairing y_pred with the full test_data column would fail or misalign as
# soon as a single row had been dropped.
submission_ids = test_data.loc[X.index, 'Unnamed: 0'].to_numpy()

my_submission = pd.DataFrame({'ID': submission_ids,'DS_STATUS_ITEM':y_pred})

# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)

my_submission

Unnamed: 0,ID,DS_STATUS_ITEM
0,0,Autorizado
1,5,Autorizado
2,8,Negado
3,13,Autorizado
4,18,Negado
...,...,...
186139,413210,Autorizado
186140,413218,Autorizado
186141,413221,Autorizado
186142,413248,Autorizado
