In [16]:
import os
import pickle
import re
import string
import sys

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
#from nltk.corpus import stopwords
#import nltk

In [17]:
# Make the project-local helper module (`functions`) importable.
# The path is relative, so it is resolved against the kernel's current
# working directory at import time.
UTILS_DIR = '../utils'
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)
import functions as func

In [18]:
# Load the raw phishing e-mail dataset; the preview below shows a
# `text_combined` column and a `label` column.
PHISHING_CSV = '../data/phishing_email.csv'
df_phishing = pd.read_csv(PHISHING_CSV)
# Preview the first rows as a sanity check on the load.
df_phishing.head()

Unnamed: 0,text_combined,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0


In [24]:
# Run the project's text-cleaning helper over the raw frame.
# Judging by the preview below, the result exposes the text in a `body`
# column next to `label`, with digits stripped from the text.
# NOTE(review): whether the helper mutates `df_phishing` in place is not
# visible from this notebook - confirm in utils/functions.py.
df_processed = func.limpieza_texto(df_phishing)
# Preview the first five cleaned rows.
df_processed.head(5)

Unnamed: 0,body,label
0,hpl nom may see attached file hplno xls hpl...,0
1,nom actual vols th forwarded sabrae zajac hou...,0
2,enron actuals march april estimated actuals...,0
3,hpl nom may see attached file hplno xls hpl...,0
4,hpl nom june see attached file hplno xls hp...,0


In [20]:
# Print the most frequent words of the cleaned corpus with their counts.
# NOTE(review): the second argument is presumably the number of words to
# list - confirm against the helper's signature in utils/functions.py.
N_TOP_WORDS = 100
func.palabras_mas_comunes(df_processed, N_TOP_WORDS)

Palabras más comunes:
enron: 53886
aug: 47868
email: 42919
new: 36336
ect: 34872
submissionid: 32246
time: 30149
company: 29660
information: 27868
com: 23763
message: 23200
list: 22868
like: 22464
use: 22458
added: 21292
submission: 21137
news: 21086
subject: 20797
business: 20076
sender: 19975
total: 19955
money: 19753
notes: 19692
need: 19577
know: 19455
wed: 18614
account: 18602
mail: 18098
make: 17755
virus: 17324
university: 17069
pm: 17056
hou: 16901
thu: 16687
work: 16393
want: 16332
wrote: 15811
free: 15678
send: 15259
daily: 14964
cnncom: 14558
thanks: 14470
people: 14256
help: 14216
said: 14019
click: 13921
address: 13803
contact: 13698
data: 13542
going: 13205
network: 13191
receive: 13177
sent: 12896
best: 12891
unsubscribe: 12397
way: 12240
dont: 12078
using: 12075
gas: 12039
bank: 12019
fri: 11833
cnn: 11812
web: 11704
price: 11654
day: 11641
number: 11569
file: 11552
group: 11551
good: 11237
million: 11228
order: 11177
date: 11145
think: 10931
deal: 10916
energy: 10826
s

In [25]:
# Split the cleaned frame into model input and target.
# X is deliberately kept as a one-column DataFrame: the modelling cells
# select the text back out with X_train['body'] / X_test['body'].
FEATURE_COLUMNS = ['body']
TARGET_COLUMN = 'label'
X = df_processed[FEATURE_COLUMNS]
y = df_processed[TARGET_COLUMN]

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

### Model naive-bayes

In [8]:
# Naive Bayes baseline: TF-IDF features (vocabulary capped at 5,500 terms,
# English stop words removed) fed into a multinomial Naive Bayes classifier.
nb_classifier = make_pipeline(
    TfidfVectorizer(max_features=5500, stop_words='english'),
    MultinomialNB(),
)
nb_classifier.fit(X_train['body'], y_train)

# Model evaluation.
# Predict once per split and reuse the result: every predict() call
# re-vectorises the whole split, so calling it separately for the report
# and for the accuracy doubles the work without changing the output.
nb_pred_test = nb_classifier.predict(X_test['body'])
nb_pred_train = nb_classifier.predict(X_train['body'])

# Test report first, then train report, to make over-fitting visible.
print(classification_report(y_test, nb_pred_test))
print(classification_report(y_train, nb_pred_train))

print(accuracy_score(y_test, nb_pred_test))
print(accuracy_score(y_train, nb_pred_train))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      7935
           1       0.98      0.95      0.96      8563

    accuracy                           0.96     16498
   macro avg       0.96      0.96      0.96     16498
weighted avg       0.96      0.96      0.96     16498

              precision    recall  f1-score   support

           0       0.95      0.98      0.96     31660
           1       0.98      0.95      0.96     34328

    accuracy                           0.96     65988
   macro avg       0.96      0.96      0.96     65988
weighted avg       0.96      0.96      0.96     65988

0.9614498727118439
0.9620840152755047


### Model logistic regression

In [9]:
# Logistic regression: TF-IDF features (vocabulary capped at 5,500 terms,
# English stop words removed) fed into LogisticRegression.
# C is the inverse regularisation strength; 5.0 regularises less than the
# scikit-learn default of 1.0.
logistic_regression = make_pipeline(
    TfidfVectorizer(max_features=5500, stop_words='english'),
    LogisticRegression(C=5.0),
)
logistic_regression.fit(X_train['body'], y_train)

# Model evaluation.
# Predict once per split and reuse the result: every predict() call
# re-vectorises the whole split, so calling it separately for the report
# and for the accuracy doubles the work without changing the output.
logreg_pred_test = logistic_regression.predict(X_test['body'])
logreg_pred_train = logistic_regression.predict(X_train['body'])

# Test report first, then train report, to make over-fitting visible.
print(classification_report(y_test, logreg_pred_test))
print(classification_report(y_train, logreg_pred_train))

print(accuracy_score(y_test, logreg_pred_test))
print(accuracy_score(y_train, logreg_pred_train))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7935
           1       0.98      0.99      0.98      8563

    accuracy                           0.98     16498
   macro avg       0.98      0.98      0.98     16498
weighted avg       0.98      0.98      0.98     16498

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     31660
           1       0.99      0.99      0.99     34328

    accuracy                           0.99     65988
   macro avg       0.99      0.99      0.99     65988
weighted avg       0.99      0.99      0.99     65988

0.983452539701782
0.9923622476813966


### Model XGBoost

In [10]:
# Gradient-boosted trees: TF-IDF features (vocabulary capped at 5,500 terms,
# English stop words removed) fed into an XGBoost classifier with
# 250 boosting rounds and a learning rate of 0.7.
xgb_model = make_pipeline(
    TfidfVectorizer(max_features=5500, stop_words='english'),
    xgb.XGBClassifier(learning_rate=0.7, n_estimators=250),
)
xgb_model.fit(X_train['body'], y_train)

# Model evaluation.
# Predict once per split and reuse the result: every predict() call
# re-vectorises the whole split and walks all 250 trees, so calling it
# separately for the report and for the accuracy doubles the work without
# changing the output.
xgb_pred_test = xgb_model.predict(X_test['body'])
xgb_pred_train = xgb_model.predict(X_train['body'])

# Test report first, then train report, to make over-fitting visible.
print(classification_report(y_test, xgb_pred_test))
print(classification_report(y_train, xgb_pred_train))

print(accuracy_score(y_test, xgb_pred_test))
print(accuracy_score(y_train, xgb_pred_train))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      7935
           1       0.98      0.99      0.99      8563

    accuracy                           0.99     16498
   macro avg       0.99      0.99      0.99     16498
weighted avg       0.99      0.99      0.99     16498

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     31660
           1       1.00      1.00      1.00     34328

    accuracy                           1.00     65988
   macro avg       1.00      1.00      1.00     65988
weighted avg       1.00      1.00      1.00     65988

0.9867862771245
0.9985148814936049


In [12]:
# Persist the fitted Naive Bayes pipeline (vectoriser + classifier) with pickle.
nb_model_path = '../model/model_NB.pkl'
# Create the target directory first: open(..., 'wb') raises
# FileNotFoundError when '../model' does not exist yet (e.g. fresh clone).
os.makedirs(os.path.dirname(nb_model_path), exist_ok=True)
with open(nb_model_path, 'wb') as model_file:
    pickle.dump(nb_classifier, model_file)

In [13]:
# Persist the fitted logistic-regression pipeline (vectoriser + classifier) with pickle.
logreg_model_path = '../model/model_log_reg.pkl'
# Create the target directory first: open(..., 'wb') raises
# FileNotFoundError when '../model' does not exist yet (e.g. fresh clone).
os.makedirs(os.path.dirname(logreg_model_path), exist_ok=True)
with open(logreg_model_path, 'wb') as model_file:
    pickle.dump(logistic_regression, model_file)

In [14]:
# Persist the fitted XGBoost pipeline (vectoriser + classifier) with pickle.
xgb_model_path = '../model/model_xgb.pkl'
# Create the target directory first: open(..., 'wb') raises
# FileNotFoundError when '../model' does not exist yet (e.g. fresh clone).
os.makedirs(os.path.dirname(xgb_model_path), exist_ok=True)
with open(xgb_model_path, 'wb') as model_file:
    pickle.dump(xgb_model, model_file)