In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
import scikitplot as skplt
import matplotlib.pyplot as plt
from simpletransformers.language_representation import RepresentationModel

# Carregando base de dados  pré-processada

In [2]:
database = pd.read_csv("../../Database/dataBaseWithNER.csv")

database = database.drop(columns=["Unnamed: 0"])
database = database.dropna()
target = database["target"].values.tolist()
database

Unnamed: 0,email,target
0,start date hourahead timee cardinall hou...,0
1,service long desk price structure deal quote ...,0
2,start date cardinall hourahead timee card...,0
3,start date hourahead timee cardinall anc...,0
4,cardinall deliverable revenue management marke...,0
...,...,...
33340,bio matrix scientific group symbo bmxg p...,1
33341,cardinall step away hot naked webcam girl liv...,1
33342,need pill increase performance click seroius ...,1
33343,datee final nom inlet hpl eastrans car...,0


In [3]:
emailsText = []
for email in database["email"]:
    emailsText.append(email)

In [4]:
print(len(emailsText))

33341


# Representação vetorial GPT2

In [5]:
model=RepresentationModel(
    model_type="gpt2",
    model_name="gpt2",
    use_cuda=True,
    #fp16=True
)

vectorialRepresentation = model.encode_sentences(emailsText, combine_strategy="mean")
vectorialRepresentation.shape

Some weights of the model checkpoint at gpt2 were not used when initializing GPT2ForTextRepresentation: ['h.7.ln_2.weight', 'h.2.attn.c_proj.bias', 'h.1.ln_2.weight', 'h.8.attn.c_proj.weight', 'h.9.ln_1.weight', 'h.9.ln_2.bias', 'h.8.ln_1.weight', 'h.10.mlp.c_fc.bias', 'h.0.attn.bias', 'h.9.mlp.c_proj.weight', 'h.10.ln_2.bias', 'h.2.ln_2.bias', 'h.5.mlp.c_proj.weight', 'h.3.attn.bias', 'h.10.mlp.c_proj.weight', 'h.7.mlp.c_fc.weight', 'wpe.weight', 'h.0.attn.c_proj.bias', 'h.5.attn.c_proj.weight', 'h.1.mlp.c_fc.bias', 'h.1.ln_1.bias', 'h.1.mlp.c_fc.weight', 'h.5.attn.bias', 'h.4.ln_1.weight', 'h.6.attn.c_attn.bias', 'h.0.ln_1.bias', 'h.3.attn.c_proj.bias', 'h.3.mlp.c_proj.bias', 'h.7.ln_2.bias', 'h.9.mlp.c_fc.bias', 'h.1.mlp.c_proj.bias', 'h.1.mlp.c_proj.weight', 'h.8.ln_2.bias', 'h.4.mlp.c_proj.bias', 'h.5.attn.c_proj.bias', 'h.2.ln_1.bias', 'h.10.attn.c_attn.bias', 'h.9.attn.c_attn.bias', 'h.3.attn.c_proj.weight', 'h.4.attn.c_proj.weight', 'h.10.ln_1.weight', 'h.5.mlp.c_proj.bias', 'h

(33341, 768)

In [6]:
gpt2Dataframe = pd.DataFrame(vectorialRepresentation)
gpt2Dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-1.030083,0.109724,0.927873,-0.617342,0.543541,0.427058,-0.514676,1.694434,-0.580174,-1.091022,...,-0.498598,0.351335,0.853568,2.374959,-1.410789,1.245473,-0.473885,0.478011,0.140457,-0.619885
1,-0.092674,-0.381372,2.012866,-0.735727,0.072879,0.071605,-1.176053,1.502053,-0.769178,-1.291546,...,-0.474673,1.108720,0.889351,2.316023,0.087432,0.612741,-0.030229,0.735440,-0.392663,-0.129117
2,-1.051408,0.299784,0.841643,-0.449345,-0.329442,-0.094266,-0.336205,1.833698,-0.349068,-0.784353,...,-0.587131,0.640777,1.131895,2.433959,-2.078627,1.102487,-1.407242,0.209603,-0.571815,-0.152947
3,-0.883784,0.286621,1.016412,-0.888368,0.062395,-0.270674,-0.299684,2.073566,-0.537574,-0.802398,...,-0.680455,0.264351,0.937522,2.707597,-1.718038,1.148977,-0.955044,0.538847,-0.090273,-0.122395
4,-0.209628,0.685009,0.931885,-0.376099,0.474256,0.707136,0.144236,0.863746,-0.421887,-0.791166,...,0.548276,-0.438824,2.187341,1.540887,-0.410145,1.246987,-0.476006,-0.630435,-1.297185,-0.154112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33336,-0.513509,0.128544,2.039597,-0.299465,-0.599164,-0.393354,0.112577,1.611766,-0.406810,0.193332,...,-0.549131,0.275840,1.047210,1.753217,-1.041353,0.628145,-1.651687,0.745647,-0.047421,1.033260
33337,-0.583663,0.689026,2.111216,0.589455,-0.939575,1.173398,-0.115160,0.952676,0.079639,-0.912642,...,-0.333912,1.120153,0.981974,1.904992,-1.181392,0.934996,-0.200818,-0.540170,-0.359948,-0.219496
33338,-1.495019,0.790122,1.807736,-0.786408,-0.355426,0.124692,0.333372,1.503447,-0.867034,0.268472,...,-0.744148,0.447202,1.260685,1.905787,-0.572558,0.783855,-0.512786,-0.382856,-0.284805,-0.314551
33339,-1.155704,0.062052,0.672390,0.484694,-0.193628,-0.465796,-0.932989,1.382008,-0.089423,-0.623302,...,-0.414172,0.920319,1.126764,2.062665,-1.038254,0.336398,-1.023622,0.650219,0.000506,0.076306


# Visualização de dados com TSNE

In [7]:
model = TSNE(n_components=2, random_state=0)
array_red = model.fit_transform(gpt2Dataframe)

df_tsne = pd.DataFrame(array_red)

df_tsne['Target'] = target
print(df_tsne)
df_tsne_c1 = df_tsne[df_tsne['Target'] == 0]

df_tsne_c2 = df_tsne[df_tsne['Target'] == 1]

plt.scatter(df_tsne_c1[0].array,df_tsne_c1[1].array,marker='o',color='blue')

plt.scatter(df_tsne_c2[0].array,df_tsne_c2[1].array,marker='o',color='red')

plt.title('Dados')
plt.xlabel('x')
plt.ylabel('y')

plt.show()



MemoryError: Unable to allocate 85.1 MiB for an array with shape (3340, 3340) and data type float64

# Validação

In [None]:
def getModel():
    return DecisionTreeClassifier()

In [None]:
X_treino, X_teste, y_treino, y_teste = train_test_split(gpt2Dataframe.values,target,test_size=0.2)
modelo = getModel().fit(X_treino,y_treino)
score = modelo.score(X_teste,y_teste)
score

In [None]:
scores = cross_val_score(getModel(),gpt2Dataframe.values,target,cv=10)

scores.mean()

In [None]:
predicoes = cross_val_predict(getModel(), gpt2Dataframe.values, target, cv=10)

In [None]:
print("\nClassification Report : ")
print(classification_report(target, predicoes, target_names=["Ham", "Spam"]))

In [None]:
categories = ["Ham", "Spam"]

skplt.metrics.plot_confusion_matrix(
    [categories[i] for i in target], [categories[i] for i in predicoes.tolist()],
    title="Confusion Matrix",
    cmap="Purples",
    hide_zeros=True,
    figsize=(5,5)
)

plt.xticks()

In [None]:
skplt.metrics.plot_confusion_matrix(
    [categories[i] for i in target], [categories[i] for i in predicoes.tolist()],
    normalize=True,
    title="Confusion Matrix",
    cmap="Purples",
    hide_zeros=True,
    figsize=(5,5)
)

plt.xticks()