In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import scikitplot as skplt
import tensorflow as tf
from tensorflow import keras
import seaborn as sns

# Carregando base de dados pré-processada

In [2]:
# Read the pre-processed e-mail corpus, discard the "Unnamed: 0" column
# (presumably an index column written out when the CSV was saved), and drop
# every row that contains a missing value.
database = (
    pd.read_csv("../../Database/dataBaseWithNER.csv")
    .drop(columns=["Unnamed: 0"])
    .dropna()
)

# Class labels from the "target" column, kept as a pandas extension array.
target = database["target"].array

# Display the cleaned frame (pandas truncates the rendering).
database

Unnamed: 0,email,target
0,start date hourahead timee cardinall hou...,0
1,service long desk price structure deal quote ...,0
2,start date cardinall hourahead timee card...,0
3,start date hourahead timee cardinall anc...,0
4,cardinall deliverable revenue management marke...,0
...,...,...
33340,bio matrix scientific group symbo bmxg p...,1
33341,cardinall step away hot naked webcam girl liv...,1
33342,need pill increase performance click seroius ...,1
33343,datee final nom inlet hpl eastrans car...,0


In [3]:
# Plain Python list holding the text of every e-mail, in row order.
emailsText = database["email"].tolist()

In [4]:
# Sanity check: number of e-mails collected from the cleaned frame.
print(len(emailsText))

33341


# Representação vetorial Bag of Words

In [5]:
# Bag-of-words representation: token counts restricted to the 2100 most
# frequent terms of the corpus.
# NOTE(review): the vocabulary is fitted on the whole corpus, before the
# train/test split below, so term selection sees the held-out e-mails.
# Consider fitting the vectorizer on the training portion only.
vectorizer = CountVectorizer(max_features=2100)
XTrain = vectorizer.fit_transform(emailsText)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Dense DataFrame with one row per e-mail and one column per vocabulary term.
bag = pd.DataFrame(XTrain.toarray(), columns=feature_names)

bag



Unnamed: 0,aa,ability,able,absolutely,abuse,accept,acceptance,accepted,access,according,...,xanax,xl,xp,yahoo,year,yes,yield,yo,young,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33336,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
33337,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33339,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Visualização de dados com TSNE

In [6]:
# NOTE: this whole cell is disabled (commented out). It projects the
# bag-of-words matrix to 2-D with t-SNE and scatter-plots the two classes
# (Target == 0 in blue, Target == 1 in red).
# NOTE(review): the alternative PCA line below would also need
# `from sklearn.decomposition import PCA`, which this notebook does not import.
#
# model = TSNE(n_components=2, random_state=0)
# #model = PCA(n_components=50, svd_solver='full')
# array_red = model.fit_transform(bag)
#
# df_tsne = pd.DataFrame(array_red)
#
# df_tsne['Target'] = target
# df_tsne_c1 = df_tsne[df_tsne['Target'] == 0]
#
# df_tsne_c2 = df_tsne[df_tsne['Target'] == 1]
#
# plt.scatter(df_tsne_c1[0].array,df_tsne_c1[1].array,marker='o',color='blue')
#
# plt.scatter(df_tsne_c2[0].array,df_tsne_c2[1].array,marker='o',color='red')
#
# plt.title('Dados')
# plt.xlabel('x')
# plt.ylabel('y')
#
# plt.show()

# Validação

In [7]:
# Hold out 20% of the e-mails for testing.
# `random_state` makes the split (and every metric computed from it)
# reproducible across kernel restarts. `stratify` keeps the class proportions
# the same in both partitions.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    bag.values,
    target,
    test_size=0.2,
    random_state=42,
    stratify=np.asarray(target),
)

In [8]:
# Peek at the training feature matrix (NumPy abbreviates large arrays).
X_treino

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
# (number of training e-mails, number of vocabulary terms)
print(X_treino.shape)

(26672, 2100)


In [10]:
# Peek at the training labels.
y_treino

<PandasArray>
[0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
 ...
 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]
Length: 26672, dtype: int64

In [None]:
# Multi-layer perceptron over the bag-of-words count vectors.
n_features = bag.shape[1]      # one input per vocabulary term
n_classes = len(set(target))   # one softmax unit per distinct label

model = keras.models.Sequential()
# Input is already a flat vector; Flatten only declares the input shape here.
model.add(keras.layers.Flatten(input_shape=(n_features,)))
# Three fully connected ReLU layers of decreasing width.
for units in (1000, 300, 100):
    model.add(keras.layers.Dense(units, activation="relu"))
# Output layer: class probabilities.
model.add(keras.layers.Dense(n_classes, activation="softmax"))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Training configuration: plain SGD, cross-entropy over integer class labels
# (matching the softmax output layer), and accuracy tracked as a metric.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Cut the learning rate by 10x when the validation loss has not improved by
# at least 1e-4 for 7 consecutive epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=7,
    verbose=1,
    min_delta=1e-4,
    mode="min",
)
# Stop after 20 epochs without validation-loss improvement and roll back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=20,
    verbose=0,
    restore_best_weights=True,
)
callbacks = [reduce_lr, early_stopping]

# Validation data is carved out of the TRAINING set (last 10% of the samples
# passed to fit) instead of reusing the test split. Both callbacks select the
# model based on `val_loss`; monitoring the test split would leak it into
# model selection and make the later `model.evaluate` score optimistic.
history = model.fit(
    np.array(X_treino),
    np.array(y_treino),
    epochs=200,
    validation_split=0.1,
    callbacks=callbacks,
)

## VALIDANDO DE FORMA MAIS ELABORADA

In [None]:
# Learning curves: every quantity recorded in `history.history`, per epoch.
fig, ax = plt.subplots(figsize=(8, 5))
pd.DataFrame(history.history).plot(ax=ax)
ax.grid(True)
ax.set_ylim(0, 1)  # clamp the vertical range to [0, 1]
ax.set_xlabel('epoch')
plt.show()

In [None]:
# Loss and accuracy of the trained network on the held-out test split.
model.evaluate(x=np.array(X_teste), y=np.array(y_teste))

In [None]:
# Confusion matrix on the held-out test split only. Scoring the full dataset
# would mix in the training e-mails the network has already seen and inflate
# the result.
labels = ["Ham", "Spam"]

y_pred_teste = np.argmax(model.predict(np.array(X_teste)), axis=-1)

# `labels=[0, 1]` pins the matrix to 2x2 (row/column 0 = Ham, 1 = Spam) even
# if one class happens to be missing from the predictions.
cm = confusion_matrix(np.array(y_teste), y_pred_teste, labels=[0, 1])

# Rows are true classes, columns are predicted classes.
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm_df, annot=True, fmt="d", xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")

plt.show()

In [None]:
# Per-class precision / recall / F1 on the held-out test split only.
# Reporting on the full dataset would include the training e-mails and
# overstate the model's quality.
print(
    classification_report(
        np.array(y_teste),
        np.argmax(model.predict(np.array(X_teste)), axis=-1),
        labels=[0, 1],
        target_names=["Ham", "Spam"],
    )
)