# Importação de base de dados

In [7]:
import numpy as np
import pandas as pd
import os

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import InputLayer, Dense


import matplotlib.pyplot as plt

In [8]:
TRN_X = pd.read_csv('TRN_X.csv')
TRN_Y = pd.read_csv('TRN_Y.csv')

TST_X = pd.read_csv('TST_X.csv')
TST_Y = pd.read_csv('TST_Y.csv')

In [9]:
TRN_X.fillna(0, inplace=True)
TRN_Y.fillna(0, inplace=True)

In [10]:
TRN_X.head()

Unnamed: 0,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,...,onlineStatus_n,onlineStatus_y,availability_ausente,availability_completely not determinable,availability_completely not orderable,availability_completely orderable,availability_mainly not determinable,availability_mainly not orderable,availability_mainly orderable,availability_mixed
0,6,5,39.887,1,59.99,59.99,59.99,1,59.99,59.99,...,0,1,0,0,0,1,0,0,0,0
1,6,5,94.469,0,14.99,49.99,358.935,0,19.99,39.99,...,0,1,0,0,0,1,0,0,0,0
2,6,5,341.613,11,9.99,29.99,109.95,2,9.99,29.99,...,0,1,0,0,0,1,0,0,0,0
3,6,5,42.812,4,4.99,4.99,19.96,1,4.99,4.99,...,0,1,0,0,1,0,0,0,0,0
4,6,5,2816.046,45,12.99,179.95,1093.72,4,19.99,27.85,...,0,1,0,0,0,1,0,0,0,0


In [11]:
# size of TRN_Y
TRN_Y.loc['1', TRN_Y.columns] = 1

# Classificador Random Forest

In [12]:
#Classificador Random Forest

RCF = RandomForestClassifier(n_estimators=100, max_features=10, random_state=42)

X = TRN_X
y = TRN_Y

kf = KFold(n_splits=10, random_state=42, shuffle=True)
split_num = 1
for train_index, test_index in kf.split(X):
    print("\nNÚMERO DO SPLIT:", split_num)
    split_num += 1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    RCF.fit(X_train, y_train)

    y_pred = RCF.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    df_cm = pd.DataFrame(cm)

    df_cm = df_cm[[1, 0]]
    df_cm = df_cm.sort_index( ascending = False )
    print("Matriz de Confusão (linhas = real | colunas = previsto):\n", df_cm)

    # Verdadeiro positivo
    a = df_cm.iloc[0,0]
    # Falso positivo
    b = df_cm.iloc[1,0]
    # Falso negativo
    c = df_cm.iloc[0, 1]
    # Verdadeiro negativo
    d = df_cm.iloc[1, 1]

    # Calculando a acurácia do modelo
    acuracia = (a + d) / (a + b + c + d)
    print("Acuracia:", acuracia)

    # Calculando a precisão do modelo
    precisao = a / (a + b)
    print("Precisão:", precisao)

    # Calculando a sensibilidade do modelo
    sensibilidade = a / (a + c)
    print("Sensibilidade:", sensibilidade)

    # Calculando a especificidade do modelo
    especificidade = d / (b + d)
    print("Especificidade:", especificidade)

    # Calculando a área sob a curva ROC
    roc_auc = roc_auc_score(y_test, y_pred)
    print("Área sob a curva ROC:", roc_auc)

    # Gerando o gráfico da área sob a curva ROC
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

##Gerando a base de predição
RC_y_pred= RCF.predict(TRN_X)

##Salvando o modelo de predição
path = 'data'
if not os.path.exists(path):
    os.mkdir(path)
    
DF_RC_y_pred = pd.DataFrame(RC_y_pred)
DF_RC_y_pred.to_csv(f"{path}/predicted_y", sep=',', encoding='utf-8', index=False)


NÚMERO DO SPLIT: 1


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
# Classificador Rede Neural MLP

input_neuron, hidden_neuron, output_neuron = X.shape[-1], X.shape[-1]*3, 1

mlp = Sequential([
    InputLayer(input_neuron),
    Dense(hidden_neuron, activation="relu"),
    Dense(output_neuron, activation="sigmoid")
])

mlp.summary()

mlp.compile(optimizer="adam", loss="binary_crossentropy")

X = TRN_X
y = TRN_Y

kf = KFold(n_splits=10, random_state=42, shuffle=True)
split_num = 1
for train_index, test_index in kf.split(X):
    print("\nNÚMERO DO SPLIT:", split_num)
    split_num += 1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    batch_size = 8
    Log = mlp.fit(X_train, y_train, batch_size=batch_size, epochs=50, shuffle=False)

    fig, ax = plt.subplots()
    ax.plot(Log.history["loss"],'r', marker='.', label="Erro no treinamento")
    ax.legend()
    plt.show()

    y_pred = mlp.predict(X_test)

    cm = confusion_matrix(y_test, np.round(abs(y_pred)))
    df_cm = pd.DataFrame(cm)

    df_cm = df_cm[[1, 0]]
    df_cm = df_cm.sort_index( ascending = False )
    print("Matriz de Confusão (linhas = real | colunas = previsto):\n", df_cm)

    # Verdadeiro positivo
    a = df_cm.iloc[0,0]
    # Falso positivo
    b = df_cm.iloc[1,0]
    # Falso negativo
    c = df_cm.iloc[0, 1]
    # Verdadeiro negativo
    d = df_cm.iloc[1, 1]

    # Calculando a acurácia do modelo
    acuracia = (a + d) / (a + b + c + d)
    print("Acuracia:", acuracia)

    # Calculando a precisão do modelo
    precisao = a / (a + b)
    print("Precisão:", precisao)

    # Calculando a sensibilidade do modelo
    sensibilidade = a / (a + c)
    print("Sensibilidade:", sensibilidade)

    # Calculando a especificidade do modelo
    especificidade = d / (b + d)
    print("Especificidade:", especificidade)

    # Calculando a área sob a curva ROC
    roc_auc = roc_auc_score(y_test, y_pred)
    print("Área sob a curva ROC:", roc_auc)

    ##Gerando o gráfico da área sob a curva ROC
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

##Gerando a base de predição
Log_y_pred = mlp.predict(TRN_X)

##Salvando o modelo de predição
path = 'data'
if not os.path.exists(path):
    os.mkdir(path)

DF_Log_y_pred = pd.DataFrame(np.round(abs(Log_y_pred)))
DF_Log_y_pred.to_csv(f"{path}/session_predict_y", sep=',', encoding='utf-8')