In [1]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow import keras
import tensorflow as tf

In [2]:
# Load the pre-processed e-mail corpus, discard the index column that was
# written out together with the CSV, and drop every row with a missing value.
database = (
    pd.read_csv("../../Database/dataBaseWithNER.csv")
    .drop(columns=["Unnamed: 0"])
    .dropna()
)

# Labels are taken after the dropna so they stay aligned with the kept rows.
target = database["target"].array

database

Unnamed: 0,email,target
0,start date hourahead timee cardinall hou...,0
1,service long desk price structure deal quote ...,0
2,start date cardinall hourahead timee card...,0
3,start date hourahead timee cardinall anc...,0
4,cardinall deliverable revenue management marke...,0
...,...,...
33340,bio matrix scientific group symbo bmxg p...,1
33341,cardinall step away hot naked webcam girl liv...,1
33342,need pill increase performance click seroius ...,1
33343,datee final nom inlet hpl eastrans car...,0


In [3]:
# Plain Python list with the text of every e-mail, in DataFrame row order.
emailsText = list(database["email"])

In [4]:
# Bag-of-words encoding with the vocabulary capped at 2100 terms.
vectorizer = CountVectorizer(max_features=2100)
XTrain = vectorizer.fit_transform(emailsText)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense document-term table: one row per e-mail, one column per vocabulary term.
bag = pd.DataFrame(XTrain.toarray(), columns=feature_names)

bag



Unnamed: 0,aa,ability,able,absolutely,abuse,accept,acceptance,accepted,access,according,...,xanax,xl,xp,yahoo,year,yes,yield,yo,young,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33336,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
33337,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33339,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Hold-out split: 80 % of the rows for training, 20 % for testing.
# The seed is fixed so that a fresh kernel run reproduces the same partition.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    bag.values, target, test_size=0.2, random_state=42
)

In [6]:
# Stack the two partitions back together (training rows first, test rows
# after) into a single feature matrix and a single label vector.
inputs = np.concatenate([X_treino, X_teste])
targets = np.concatenate([y_treino, y_teste])

In [7]:
# Accumulators for the per-fold evaluation results (accuracy in %, raw loss).
acc_per_fold, loss_per_fold = [], []

In [8]:
# Define the K-fold Cross Validator.
# Shuffling is seeded so the fold membership is identical on every run;
# without `random_state` each execution would evaluate on different folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
# K-fold Cross Validation model evaluation: train a fresh MLP on each
# training partition and score it on the corresponding held-out fold.

# Start from empty result lists so that re-running this cell does not append
# a second set of scores to those left over from a previous execution.
acc_per_fold = []
loss_per_fold = []

# Loop-invariant model dimensions. The class count is taken from the full
# label vector so the output layer covers every label, even if one were
# absent from a particular partition.
n_features = inputs.shape[1]
n_classes = len(np.unique(targets))

for fold_no, (train, test) in enumerate(kfold.split(inputs, targets), start=1):

    # Release the previous fold's model from the global Keras state;
    # otherwise each iteration keeps another network alive in memory.
    keras.backend.clear_session()

    # Define the model architecture (MLP: 1000 -> 300 -> 100 -> n_classes)
    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=(n_features,)),
        keras.layers.Dense(1000, activation="relu"),
        keras.layers.Dense(300, activation="relu"),
        keras.layers.Dense(100, activation="relu"),
        keras.layers.Dense(n_classes, activation="softmax"),
    ])

    # Compile the model; labels are integer class ids, hence the sparse loss.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="sgd",
                  metrics=['accuracy'])

    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Both callbacks watch the validation loss: the first cuts the learning
    # rate by 10x after 7 stagnant epochs, the second stops after 20 stagnant
    # epochs and restores the best weights seen.
    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.1, patience=7,
            verbose=1, min_delta=1e-4, mode='min',
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=20,
            verbose=0, restore_best_weights=True,
        ),
    ]

    # Fit data to model; 5 % of the training partition is held back by Keras
    # as the validation set that drives the callbacks above.
    history = model.fit(inputs[train], targets[train], epochs=200,
                        callbacks=callbacks, validation_split=0.05)

    # Generate generalization metrics on the fold that was never trained on
    scores = model.evaluate(inputs[test], targets[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 16: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 23: ReduceLROnPlateau reducing learning rate to 9.999999310821295e-05.
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Score for fold 1: loss of 0.08282823115587234; accuracy of 97.27095365524292%
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 14: ReduceLROnPlateau reduc

In [10]:
# == Per-fold results followed by the cross-validation averages ==
separator = '------------------------------------------------------------------------'

print(separator)
print('Score per fold')
for idx, accuracy in enumerate(acc_per_fold):
    print(separator)
    print(f'> Fold {idx + 1} - Loss: {loss_per_fold[idx]} - Accuracy: {accuracy}%')
print(separator)
print('Average scores for all folds:')
# Accuracy is reported as mean and standard deviation, loss as mean only.
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print(separator)

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.08282823115587234 - Accuracy: 97.27095365524292%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.09300064295530319 - Accuracy: 97.28554487228394%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.07573442161083221 - Accuracy: 97.51049876213074%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.07687965035438538 - Accuracy: 97.57048487663269%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.07118797302246094 - Accuracy: 98.17036390304565%
------------------------------------------------------------------------
Average scores for all folds:
> Accuracy: 97.56156921386719 (+- 0.3267921338453489)
> Loss: 0.07992618381977082
----------------------