In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import scikitplot as skplt
import torch
from simpletransformers.language_representation import RepresentationModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

# Carregando a base de dados pré-processada

In [2]:
# Read the pre-processed e-mail corpus, drop the "Unnamed: 0" column and
# discard every row that has a missing value.
database = (
    pd.read_csv("../../Database/dataBaseWithNER.csv")
    .drop(columns=["Unnamed: 0"])
    .dropna()
)

# Class labels as a plain Python list, in the same row order as `database`.
target = database["target"].values.tolist()
database

Unnamed: 0,email,target
0,start date hourahead timee cardinall hou...,0
1,service long desk price structure deal quote ...,0
2,start date cardinall hourahead timee card...,0
3,start date hourahead timee cardinall anc...,0
4,cardinall deliverable revenue management marke...,0
...,...,...
33340,bio matrix scientific group symbo bmxg p...,1
33341,cardinall step away hot naked webcam girl liv...,1
33342,need pill increase performance click seroius ...,1
33343,datee final nom inlet hpl eastrans car...,0


In [3]:
# Plain Python list with the text of every e-mail, in row order.
emailsText = list(database["email"])

In [4]:
# Number of e-mails that will be encoded.
n_emails = len(emailsText)
print(n_emails)

33341


# Representação vetorial com GPT-2

In [5]:
# Number of e-mails sent through GPT-2 per forward pass. The previous run of
# this cell aborted with "CUDA error: out of memory", so the batch is kept
# small to lower the peak GPU memory needed at any one time.
ENCODE_BATCH_SIZE = 8

model = RepresentationModel(
    model_type="gpt2",
    model_name="gpt2",
    # Use the GPU when one is available, otherwise fall back to the CPU
    # instead of failing at construction time.
    use_cuda=torch.cuda.is_available(),
)

# Inference only: with autograd disabled no activations are retained for a
# backward pass that never happens.
with torch.no_grad():
    vectorialRepresentation = model.encode_sentences(
        emailsText,
        combine_strategy="mean",
        batch_size=ENCODE_BATCH_SIZE,
    )

vectorialRepresentation.shape

Some weights of the model checkpoint at gpt2 were not used when initializing GPT2ForTextRepresentation: ['h.9.mlp.c_fc.bias', 'h.2.ln_1.bias', 'h.9.attn.c_attn.bias', 'h.3.attn.bias', 'h.11.ln_1.weight', 'h.3.ln_2.weight', 'h.3.mlp.c_fc.bias', 'h.2.ln_2.bias', 'h.1.ln_1.weight', 'h.1.ln_2.weight', 'h.9.mlp.c_fc.weight', 'h.8.attn.c_proj.bias', 'h.2.mlp.c_proj.bias', 'h.4.ln_1.bias', 'h.8.attn.c_attn.bias', 'h.10.attn.bias', 'h.9.ln_2.weight', 'h.9.mlp.c_proj.weight', 'h.6.mlp.c_fc.bias', 'h.4.mlp.c_proj.weight', 'h.2.attn.c_attn.bias', 'h.11.ln_2.bias', 'h.5.attn.c_proj.bias', 'h.1.mlp.c_proj.weight', 'h.1.mlp.c_fc.bias', 'h.0.mlp.c_proj.weight', 'h.6.attn.c_proj.weight', 'h.3.ln_1.weight', 'h.10.mlp.c_proj.bias', 'h.0.ln_1.bias', 'h.9.ln_1.bias', 'h.10.ln_2.bias', 'h.8.mlp.c_proj.weight', 'h.9.attn.c_proj.bias', 'h.1.attn.bias', 'h.4.attn.c_proj.bias', 'h.8.ln_1.bias', 'h.7.mlp.c_proj.bias', 'h.9.attn.c_attn.weight', 'h.6.ln_1.weight', 'h.5.attn.c_proj.weight', 'h.4.ln_1.weight', 'h.6

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Wrap the embedding matrix in a DataFrame (one row per e-mail) so it can be
# inspected and handed to the steps below.
gpt2Dataframe = pd.DataFrame(data=vectorialRepresentation)
gpt2Dataframe

# Visualização de dados com TSNE

In [None]:
# Project the embeddings to two dimensions with t-SNE (fixed seed so the
# layout is repeatable) and attach the class label to every projected point.
model = TSNE(n_components=2, random_state=0)
array_red = model.fit_transform(gpt2Dataframe)

df_tsne = pd.DataFrame(array_red)
df_tsne['Target'] = target
print(df_tsne)

# One frame per class: 0 is reported as "Ham" and 1 as "Spam" further down.
df_tsne_c1 = df_tsne[df_tsne['Target'] == 0]
df_tsne_c2 = df_tsne[df_tsne['Target'] == 1]

# Draw both classes on the same axes and label them, so the colours can be
# read without consulting the code.
fig, ax = plt.subplots()
for subset, color, label in (
    (df_tsne_c1, 'blue', 'Ham'),
    (df_tsne_c2, 'red', 'Spam'),
):
    ax.scatter(subset[0].array, subset[1].array, marker='o', color=color, label=label)

ax.set_title('Dados')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.legend()

plt.show()

# Validação

In [None]:
def getModel(random_state=0):
    """Build a fresh, untrained random-forest classifier.

    Args:
        random_state: Seed forwarded to ``RandomForestClassifier`` so that
            repeated runs of the notebook grow the same forest. Pass ``None``
            to get the previous, unseeded behaviour.

    Returns:
        A new ``RandomForestClassifier`` with otherwise default settings.
    """
    return RandomForestClassifier(random_state=random_state)

In [None]:
# Hold out 20% of the e-mails for testing. The split is seeded so the score
# is repeatable between runs, and stratified so both partitions keep the same
# class proportions as the full data set.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    gpt2Dataframe.values,
    target,
    test_size=0.2,
    random_state=0,
    stratify=target,
)

# Fit on the training partition and report accuracy on the held-out one.
modelo = getModel().fit(X_treino, y_treino)
score = modelo.score(X_teste, y_teste)
score

In [None]:
# 10-fold cross-validation of a fresh classifier on the full embedding matrix;
# the cell displays the mean of the per-fold scores.
scores = cross_val_score(
    estimator=getModel(),
    X=gpt2Dataframe.values,
    y=target,
    cv=10,
)

scores.mean()

In [None]:
# Out-of-fold prediction for every e-mail from 10-fold cross-validation.
predicoes = cross_val_predict(
    estimator=getModel(),
    X=gpt2Dataframe.values,
    y=target,
    cv=10,
)

In [None]:
# Per-class precision / recall / F1 of the cross-validated predictions.
report = classification_report(
    y_true=target,
    y_pred=predicoes,
    target_names=["Ham", "Spam"],
)
print("\nClassification Report : ")
print(report)

In [None]:
# Display names for the numeric labels: index 0 -> "Ham", index 1 -> "Spam".
categories = ["Ham", "Spam"]

# Translate true and predicted labels to their display names once.
true_labels = [categories[i] for i in target]
predicted_labels = [categories[i] for i in predicoes.tolist()]

# Confusion matrix with raw counts.
skplt.metrics.plot_confusion_matrix(
    true_labels, predicted_labels,
    title="Confusion Matrix",
    cmap="Purples",
    hide_zeros=True,
    figsize=(5,5)
)

# Render the figure. The former argument-less plt.xticks() call only read the
# current ticks and echoed them as cell output without changing the plot.
plt.show()

In [None]:
# Translate true and predicted labels to their display names once.
true_labels = [categories[i] for i in target]
predicted_labels = [categories[i] for i in predicoes.tolist()]

# Same confusion matrix as above, but normalized instead of raw counts.
skplt.metrics.plot_confusion_matrix(
    true_labels, predicted_labels,
    normalize=True,
    title="Confusion Matrix",
    cmap="Purples",
    hide_zeros=True,
    figsize=(5,5)
)

# Render the figure. The former argument-less plt.xticks() call only read the
# current ticks and echoed them as cell output without changing the plot.
plt.show()